# soil_trt effects (coef=c("SBC_OLD"))
twoafternoon.trtlive.DEGs.all.v3.0anno <- read_csv(file.path("..","output","twoafternoon.trtlive.DEGs.all.v3.0anno.csv"))
## Parsed with column specification:
## cols(
## genes = col_character(),
## logFC = col_double(),
## logCPM = col_double(),
## LR = col_double(),
## PValue = col_double(),
## FDR = col_double(),
## AGI = col_character(),
## At_symbol = col_character(),
## At_short_description = col_character(),
## perc_ID = col_double()
## )
# interaction, coef = str_which(colnames(dge.twoafternoon.design),"soil_trtSBC_OLD:")
twoafternoon.interaction.trtlive.samplingday.lrt.DEGs.all.v3.0anno <- read_csv(file=file.path("..","output","twoafternoon.interaction.trtlive.samplingday.lrt.DEGs.all.v3.0anno.csv"))
## Parsed with column specification:
## cols(
## genes = col_character(),
## logFC.soil_trtSBC_OLD.sampling_day03 = col_double(),
## logFC.soil_trtSBC_OLD.sampling_day04 = col_double(),
## logFC.soil_trtSBC_OLD.sampling_day06 = col_double(),
## logFC.soil_trtSBC_OLD.sampling_day08 = col_double(),
## logFC.soil_trtSBC_OLD.sampling_day10 = col_double(),
## logFC.soil_trtSBC_OLD.sampling_day13 = col_double(),
## logFC.soil_trtSBC_OLD.sampling_day14 = col_double(),
## logCPM = col_double(),
## LR = col_double(),
## PValue = col_double(),
## FDR = col_double(),
## AGI = col_character(),
## At_symbol = col_character(),
## At_short_description = col_character(),
## perc_ID = col_double()
## )
# any soil_trt effects, coef = str_which(colnames(dge.twoafternoon.design),"soil_trtSBC_OLD")
twoafternoon.any.trtlive.DEGs.all.v3.0anno <- read_csv(file=file.path("..","output","twoafternoon.any.trtlive.DEGs.all.v3.0anno.csv"))
## Parsed with column specification:
## cols(
## genes = col_character(),
## logFC.soil_trtSBC_OLD = col_double(),
## logFC.soil_trtSBC_OLD.sampling_day03 = col_double(),
## logFC.soil_trtSBC_OLD.sampling_day04 = col_double(),
## logFC.soil_trtSBC_OLD.sampling_day06 = col_double(),
## logFC.soil_trtSBC_OLD.sampling_day08 = col_double(),
## logFC.soil_trtSBC_OLD.sampling_day10 = col_double(),
## logFC.soil_trtSBC_OLD.sampling_day13 = col_double(),
## logFC.soil_trtSBC_OLD.sampling_day14 = col_double(),
## logCPM = col_double(),
## LR = col_double(),
## PValue = col_double(),
## FDR = col_double(),
## AGI = col_character(),
## At_symbol = col_character(),
## At_short_description = col_character(),
## perc_ID = col_double()
## )
# treatment only
dge.diurnal34.trtlive.DEGs.all.v3.0anno <- read_csv(file=file.path("..","output","dge.diurnal34.trtlive.DEGs.v3.0anno.csv")) # updated
## Parsed with column specification:
## cols(
## genes = col_character(),
## logFC = col_double(),
## logCPM = col_double(),
## LR = col_double(),
## PValue = col_double(),
## FDR = col_double(),
## AGI = col_character(),
## At_symbol = col_character(),
## At_short_description = col_character(),
## perc_ID = col_double()
## )
dge.diurnal1314.trtlive.DEGs.all.v3.0anno <- read_csv(file=file.path("..","output","dge.diurnal1314.trtlive.DEGs.all.v3.0anno.csv")) # updated
## Parsed with column specification:
## cols(
## genes = col_character(),
## logFC = col_double(),
## logCPM = col_double(),
## LR = col_double(),
## PValue = col_double(),
## FDR = col_double(),
## AGI = col_character(),
## At_symbol = col_character(),
## At_short_description = col_character(),
## perc_ID = col_double()
## )
# treatment and treatment:time interaction
dge.diurnal34.any.trtlive.DEGs.all.v3.0anno <- read_csv(file=file.path("..","output","dge.diurnal34.any.trtlive.DEGs.all.v3.0anno.csv"))
## Parsed with column specification:
## cols(
## genes = col_character(),
## logFC.soil_trtSBC_OLD = col_double(),
## logFC.soil_trtSBC_OLD.sampling_time2_afternoon = col_double(),
## logFC.soil_trtSBC_OLD.sampling_time3_evening_5.30 = col_double(),
## logFC.soil_trtSBC_OLD.sampling_time4_night_1 = col_double(),
## logFC.soil_trtSBC_OLD.sampling_time5_night_2 = col_double(),
## logCPM = col_double(),
## LR = col_double(),
## PValue = col_double(),
## FDR = col_double(),
## AGI = col_character(),
## At_symbol = col_character(),
## At_short_description = col_character(),
## perc_ID = col_double()
## )
dge.diurnal1314.any.trtlive.DEGs.all.v3.0anno <- read_csv(file=file.path("..","output","dge.diurnal1314.any.trtlive.DEGs.all.v3.0anno.csv"))
## Parsed with column specification:
## cols(
## genes = col_character(),
## logFC.soil_trtSBC_OLD = col_double(),
## logFC.soil_trtSBC_OLD.sampling_time2_afternoon = col_double(),
## logFC.soil_trtSBC_OLD.sampling_time3_evening_5.30 = col_double(),
## logFC.soil_trtSBC_OLD.sampling_time4_night_1 = col_double(),
## logFC.soil_trtSBC_OLD.sampling_time5_night_2 = col_double(),
## logCPM = col_double(),
## LR = col_double(),
## PValue = col_double(),
## FDR = col_double(),
## AGI = col_character(),
## At_symbol = col_character(),
## At_short_description = col_character(),
## perc_ID = col_double()
## )
# interaction (coef = str_which(colnames(dge.diurnal34.design),":"))
dge.diurnal34.interaction.trtlive.time.DEGs.all.v3.0anno<-read_csv(file=file.path("..","output","dge.diurnal34.interaction.trtlive.time.DEGs.all.v3.0anno.csv")) # updated
## Parsed with column specification:
## cols(
## genes = col_character(),
## logFC.soil_trtSBC_OLD.sampling_time2_afternoon = col_double(),
## logFC.soil_trtSBC_OLD.sampling_time3_evening_5.30 = col_double(),
## logFC.soil_trtSBC_OLD.sampling_time4_night_1 = col_double(),
## logFC.soil_trtSBC_OLD.sampling_time5_night_2 = col_double(),
## logCPM = col_double(),
## LR = col_double(),
## PValue = col_double(),
## FDR = col_double(),
## AGI = col_character(),
## At_symbol = col_character(),
## At_short_description = col_character(),
## perc_ID = col_double()
## )
# interaction (coef = str_which(colnames(dge.diurnal1314.design),":"))
dge.diurnal1314.interaction.trtlive.time.DEGs.all.v3.0anno<-read_csv(file=file.path("..","output","dge.diurnal1314.interaction.trtlive.time.DEGs.all.v3.0anno.csv")) # updated
## Parsed with column specification:
## cols(
## genes = col_character(),
## logFC.soil_trtSBC_OLD.sampling_time2_afternoon = col_double(),
## logFC.soil_trtSBC_OLD.sampling_time3_evening_5.30 = col_double(),
## logFC.soil_trtSBC_OLD.sampling_time4_night_1 = col_double(),
## logFC.soil_trtSBC_OLD.sampling_time5_night_2 = col_double(),
## logCPM = col_double(),
## LR = col_double(),
## PValue = col_double(),
## FDR = col_double(),
## AGI = col_character(),
## At_symbol = col_character(),
## At_short_description = col_character(),
## perc_ID = col_double()
## )
# check
dge.diurnal34.any.trtlive.DEGs.all.v3.0anno %>% filter(FDR<0.05) %>% dim() # [1] 4593 14
## [1] 4593 14
# select genes with higher CV
## classic way
# Row-wise coefficient of variation, in percent: 100 * sd / mean for each row.
co.var.df <- function(x) {
  row.sds <- apply(x, 1, sd)
  row.means <- rowMeans(x)
  100 * row.sds / row.means
}
cpm.timecourse.v3.0$cv<-co.var.df(cpm.timecourse.v3.0[,-1])
# tidyverse way (not working)
#cpm.timecourse.v3.0 %>% slice(1:100) %>% select(-1) %>% group_by(%>% mutate(cv=map(.,co.var.df ))
a<-hist(cpm.timecourse.v3.0$cv)
a
## $breaks
## [1] 0 50 100 150 200 250 300 350 400 450 500 550 600 650 700
##
## $counts
## [1] 19889 5976 989 270 87 45 24 12 7 4 2 1
## [13] 1 1
##
## $density
## [1] 1.456643e-02 4.376739e-03 7.243299e-04 1.977443e-04 6.371759e-05
## [6] 3.295738e-05 1.757727e-05 8.788633e-06 5.126703e-06 2.929544e-06
## [11] 1.464772e-06 7.323861e-07 7.323861e-07 7.323861e-07
##
## $mids
## [1] 25 75 125 175 225 275 325 375 425 475 525 575 625 675
##
## $xname
## [1] "cpm.timecourse.v3.0$cv"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
# there are genes with extreme values
cpm.timecourse.v3.0 %>% filter(cv>600)
# Check expression pattern
expression.pattern.Br.graph.timecourse.v3.0annotation.cpm.2(target.genes = cpm.timecourse.v3.0 %>% dplyr::filter(cv>450) %>% dplyr::slice(1:20)) ->p
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
p
ggsave(filename="../output/highCV.absvalue.genes.expression.png",width=11,height=8) # should I remove them????
#
sum(as.integer(cpm.timecourse.v3.0$cv>30))/dim(cpm.timecourse.v3.0)[1] # [1] 0.5207265
## [1] 0.5207265
sum(as.integer(cpm.timecourse.v3.0$cv>40))/dim(cpm.timecourse.v3.0)[1] # [1] 0.3725282. Larger CV than SAS timecourse data ()??? Due to non log absolute expression value.
## [1] 0.3725282
# cf. sum(as.integer(SAS.expression.vst.s.kazu$cv>4.5))/dim(SAS.expression.vst.s.kazu)[1] #[1] 0.2300789
cpm.timecourse.v3.0.log$cv<-co.var.df(cpm.timecourse.v3.0.log[,-1])
b<-hist(cpm.timecourse.v3.0.log$cv)
b
# use largeCV
# Keep only high-CV genes (CV > 40, computed on the non-log table above).
# NOTE(review): the original indexed rows with a one-column table of
# transcript_IDs (cpm.timecourse.v3.0[cv>40,"transcript_ID"]), which does not
# select rows by ID in a tibble; filter with the logical CV mask directly.
# Assumes cpm.timecourse.v3.0 and cpm.timecourse.v3.0.log share row order —
# both are derived from the same count table. TODO confirm.
cpm.timecourse.v3.0.log.largeCV<-cpm.timecourse.v3.0.log[cpm.timecourse.v3.0$cv>40,]
dim(cpm.timecourse.v3.0.log.largeCV) # [1] 17262 289 > [1] 10173 290 (02/01/2020) (cf. SAS.expression.vst.s.kazu.largeCV is 7025 288)
c<-hist(cpm.timecourse.v3.0.log.largeCV$cv)
c
###########
#save(cpm.timecourse.v3.0.log.largeCV,file=file.path("..","output","cpm.timecourse.v3.0.log.largeCV.Rdata"))
write_csv(cpm.timecourse.v3.0.log.largeCV,path=file.path("..","output","cpm.timecourse.v3.0.log.largeCV.csv.gz"))
# The following setting is important, do not omit.
library(WGCNA) # errors in installing WGCNA on my computer at impute package installation (Jan 27, 2020). Use Whitney
options(stringsAsFactors = FALSE)
if(Sys.info()["nodename"]=="whitney") {
enableWGCNAThreads(10) # in Whitney (Maloof lab server)
} else if (Sys.info()["nodename"]=="Kazu-MBP.plb.ucdavis.edu") {
enableWGCNAThreads(2) # in my computer
}
#cpm.timecourse.v3.0.log.largeCV<-read_csv(file.path("..","output","cpm.timecourse.v3.0.log.largeCV.csv.gz"))
# for some reason in the Whitney library, columns were read as character. Needs to be fixed.
#cpm.timecourse.v3.0.log.largeCV<-read_csv(file.path("..","output","cpm.timecourse.v3.0.log.largeCV.csv.gz"),
# col_types=list(col_character(),col_double())) # error
cpm.timecourse.v3.0.log.largeCV<-read.csv(file.path("..","output","cpm.timecourse.v3.0.log.largeCV.csv.gz")) # using classic read.csv in Whitney
#load(file.path("..","output","cpm.timecourse.v3.0.log.largeCV.Rdata"))
#
datExpr <-t(cpm.timecourse.v3.0.log.largeCV[,-1])
# Choose a set of soft-thresholding powers.
# NOTE(review): the original grid was c(1:9, seq(from = 2, to = 20, by = 10)),
# i.e. c(1:9, 2, 12) — it duplicates power 2 and skips most of the 10-20 range.
# Use the standard WGCNA tutorial grid instead (power 9 chosen below is in
# both grids, so the downstream blockwiseModules(power = 9) call is unaffected).
powers = c(c(1:9), seq(from = 12, to = 20, by = 2))
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
# Plot the results:
#sizeGrWindow(9, 5)
pdf("../output/largeCV.softthresholding.pdf",width=10,height=8)
par(mfrow = c(1,2));
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
dev.off()
#
net = blockwiseModules(datExpr, power = 9,
TOMType = "unsigned", minModuleSize = 20,
reassignThreshold = 0, mergeCutHeight = 0.25,
numericLabels = TRUE, pamRespectsDendro = FALSE,
saveTOMs = TRUE,
saveTOMFileBase = "cpm.timecourse.v3.0.log.largeCV.TOM",
verbose = 3)
save(net,file="../output/net.cpm.timecourse.v3.0.log.largeCV.Rdata")
# open a graphics window
pdf(file="../output/largeCV.dendrogram.pdf",width=10,height=8)
# Convert labels to colors for plotting
mergedColors = labels2colors(net$colors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
dev.off()
# save parameters
moduleLabels = net$colors
moduleColors = labels2colors(net$colors)
MEs = net$MEs
geneTree = net$dendrograms[[1]]
save(MEs, moduleLabels, moduleColors, geneTree,file ="../output/all.largeCV.RData")
cpm.timecourse.v3.0.log.largeCV<-read.csv(file.path("..","output","cpm.timecourse.v3.0.log.largeCV.csv.gz"))
dim(cpm.timecourse.v3.0.log.largeCV) # [1] 17262 289 -> [1] 10173 290 (Feb 01, 2020)
## [1] 10173 290
load("../output/net.cpm.timecourse.v3.0.log.largeCV.Rdata")
load("../output/all.largeCV.RData")
# how many modules?
table(net$colors);length(table(net$colors)) # 7 modules
##
## 0 1 2 3 4 5 6
## 4968 4723 174 126 79 72 31
## [1] 7
cpm.timecourse.v3.0.log.largeCV.modules <- tibble(
transcript_ID=cpm.timecourse.v3.0.log.largeCV$transcript_ID,
modules=moduleColors
)
#cpm.timecourse.v3.0.log.largeCV.modules.list<-list(transcript_ID=cpm.timecourse.v3.0.log.largeCV$transcript_ID,modules=moduleColors)
## prep
# annotation file for v3.0annotation
Br.v3.0.At.BLAST <- read_csv(file.path("..","Annotation_copy","output","v3.0annotation","Brapa_v3.0_annotated.csv"))
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## name = col_character(),
## chrom = col_character(),
## subject = col_character(),
## AGI = col_character(),
## At_symbol = col_character(),
## At_full_name = col_character(),
## At_gene_model_type = col_character(),
## At_short_description = col_character(),
## At_Curator_summary = col_character(),
## At_Computational_description = col_character()
## )
## See spec(...) for full column specifications.
# This annotation is redundant with name (Br grene). Eg
Br.v3.0.At.BLAST %>% filter(name=="BraA01g040570.3C")
# reduce the redundancy (112418)
Br.v3.0anno.At.BLAST.highscore <- Br.v3.0.At.BLAST %>% group_by(name) %>% arrange(desc(score)) %>% dplyr::slice(1)
# function for adding annotation
## get object name https://stackoverflow.com/questions/14577412/how-to-convert-variable-object-name-into-string
# Turn the (bare) object name passed as an argument into a string.
# Non-standard evaluation: substitute() captures the unevaluated argument
# expression and deparse() renders it as text; the argument is never evaluated,
# so it need not exist.
myfunc <- function(v1) deparse(substitute(v1))
myfunc(foo)
## [1] "foo"
# adding annotation and write_csv adding ".v3.0anno.csv" to the object name.
# addAnno: annotate a DGE result table (genes in rownames) with the best
# Arabidopsis BLAST hit (AGI, At_symbol, At_short_description, perc_ID) from
# Br.v3.0anno.At.BLAST.highscore, write the result to
# ../output/<object-name>.v3.0anno.csv, and return the annotated table.
# NOTE: deparse(substitute(DGE)) only yields a usable file name when called
# with a bare object name (not through a pipe or inside another function).
addAnno<-function(DGE) {temp<-left_join(DGE %>% rownames_to_column(var="genes"),Br.v3.0anno.At.BLAST.highscore,by=c(genes="name")) %>% dplyr::select(genes,names(DGE),AGI, At_symbol, At_short_description, perc_ID); print(deparse(substitute(DGE)));
write_csv(temp, path=file.path("..","output",paste(deparse(substitute(DGE)),".v3.0anno.csv",sep="")));
return(temp)}
#Br.v3.0anno.At.BLAST.highscore.list<-list()
Bra.v3.0_cdna.list<-list()
#names(Bra.v3.0_cdna.list)<-names(Bra.v3.0_cdna)
names(Bra.v3.0_cdna) %in% cpm.timecourse.v3.0.log.largeCV.modules$transcript_ID
# Build Bra.v3.0_cdna.list: for each transcript in the cDNA set, store its
# WGCNA module color (named by transcript_ID) for later use as a goseq
# custom category list. Transcripts without a module assignment are skipped.
# NOTE(review): Bra.v3.0_cdna is only created further down this script
# (readDNAStringSet at the "read cDNA fasta file" step) — this loop assumes
# it already exists from a previous run/session; confirm execution order.
# The first three print() calls are leftover per-iteration debugging output.
for(i in 1:length(Bra.v3.0_cdna)) {
print(paste("i is ",i))
print(cpm.timecourse.v3.0.log.largeCV.modules %>% dplyr::filter(transcript_ID==as_vector(names(Bra.v3.0_cdna))[i]) %>% dplyr::select(transcript_ID))
print(cpm.timecourse.v3.0.log.largeCV.modules %>% dplyr::filter(transcript_ID==as_vector(names(Bra.v3.0_cdna))[i]) %>% dplyr::select(transcript_ID) %>% dim())
print(cpm.timecourse.v3.0.log.largeCV.modules %>% dplyr::filter(transcript_ID==as_vector(names(Bra.v3.0_cdna))[i]) %>% dplyr::select(transcript_ID) %>% dim() ==c(1,1))
temp<-cpm.timecourse.v3.0.log.largeCV.modules %>% dplyr::filter(transcript_ID==names(Bra.v3.0_cdna)[i]) %>% dplyr::select(transcript_ID)
print(dim(temp)[1]==0)
# skip transcripts that are not in the high-CV module table
if(dim(temp)[1]==0) next else
#Bra.v3.0_cdna.list[[i]]<-cpm.timecourse.v3.0.log.largeCV.modules[names(Bra.v3.0_cdna)[i],"modules"]
# input module
Bra.v3.0_cdna.list[[i]]<-cpm.timecourse.v3.0.log.largeCV.modules %>% filter(transcript_ID==names(Bra.v3.0_cdna)[i]) %>% dplyr::select(modules) %>% as_vector()
# input gene name
names(Bra.v3.0_cdna.list)[[i]]<-names(Bra.v3.0_cdna)[i]
}
# clean up Brgo.v3.0_cdna.list
table(sapply(Bra.v3.0_cdna.list,is.null))
Bra.v3.0_cdna.list<-Bra.v3.0_cdna.list[!sapply(Bra.v3.0_cdna.list,is.null)]
table(sapply(Bra.v3.0_cdna.list,is.null))
save(Bra.v3.0_cdna.list,file="../output/Bra.v3.0_cdna.list.Rdata")
######### Did not work
# cpm.timecourse.v3.0.log.largeCV.modules %>% nest(transcript_ID) # this is not what I want
# library(purrr)
#cpm.timecourse.v3.0.log.largeCV.modules %>% purrr::transpose()
# loading module info as custom categories compatible with goseq()
load("../output/Bra.v3.0_cdna.list.Rdata")
# GOseq
library(ShortRead);library(goseq);library(GO.db);library("annotate")
# for ggplot heatmap
## uncompress gz file
system(paste("gunzip -c ",file.path("..","Annotation_copy","input","v3.0annotation","Brapa_genome_v3.0_cds.gz")," > ",file.path("..","Annotation_copy","input","v3.0annotation","Brapa_genome_v3.0_cds.fa")))
## read cDNA fasta file
Bra.v3.0_cdna<-readDNAStringSet(file.path("..","Annotation_copy","input","v3.0annotation","Brapa_genome_v3.0_cds.fa")) # copied from /Volumes/data_work/Data8/NGS_related/Brassica_rapa_Upendra/G3
Bra.v3.0_cdna
## remove fasta file
system(paste("rm ",file.path("..","Annotation_copy","input","v3.0annotation","Brapa_genome_v3.0_cds.fa"),sep=""))
# special function for GOseq
# GOseq over-representation analysis against custom categories (WGCNA modules).
# genelist: character vector of transcript IDs to test for enrichment.
# padjust: BH-adjusted p-value cutoff (default 0.05).
# custom.category.list: gene -> category mapping (list named by transcript_ID).
# Br_cdna: DNAStringSet of all transcripts; widths are used as the length
#          bias for goseq's probability weighting function.
# Returns a data.frame of enriched categories, or the string "no enriched GO"
# when nothing passes the cutoff (callers must handle both return types).
GOseq.customcategory.ORA<-function(genelist,padjust=0.05,custom.category.list=Bra.v3.0_cdna.list,Br_cdna=Bra.v3.0_cdna) { # return GO enrichment table, padjus, padjust=0.05.
# transcript lengths as bias data, named by transcript ID
bias<-nchar(Br_cdna)
names(bias)<-names(Br_cdna)
# 0/1 vector marking which transcripts are in genelist
TF<-(names(bias) %in% genelist)*1
names(TF)<-names(bias)
#print(TF)
pwf<-nullp(TF,bias.data=bias)
#print(pwf$DEgenes)
GO.pval <- goseq(pwf,gene2cat=custom.category.list,use_genes_without_cat=TRUE) # format became different in new goseq version (021111). Does not work (042716)
#GO.pval <- goseq(pwf,gene2cat=Brgo.DF3,use_genes_without_cat=TRUE) # format became different in new goseq version (021111)
GO.pval$over_represented_padjust<-p.adjust(GO.pval$over_represented_pvalue,method="BH")
# goseq returns rows sorted by p-value, so the first row is the smallest
if(GO.pval$over_represented_padjust[1]>padjust) return("no enriched GO")
else {
enriched.GO<-GO.pval[GO.pval$over_represented_padjust<padjust,]
print("enriched.GO is")
print(enriched.GO)
return(enriched.GO)
}
}
gene.up<-twoafternoon.trtlive.DEGs.all.v3.0anno %>% filter(logFC>0&FDR<0.05) %>% dplyr::select(genes) %>% as_vector()
gene.down<-twoafternoon.trtlive.DEGs.all.v3.0anno %>% filter(logFC<0&FDR<0.05) %>% dplyr::select(genes) %>% as_vector()
enriched.GO.up<-GOseq.customcategory.ORA(genelist=gene.up) # needs to wait for Bra.v3.0_cdna.list.Rdata ready in Whitney
## Error in GOseq.customcategory.ORA(genelist = gene.up): could not find function "GOseq.customcategory.ORA"
enriched.GO.down<-GOseq.customcategory.ORA(genelist=gene.down)
## Error in GOseq.customcategory.ORA(genelist = gene.down): could not find function "GOseq.customcategory.ORA"
n<-1
gene.up.category<-cpm.timecourse.v3.0.log.largeCV.modules %>% filter(transcript_ID %in% gene.up,modules==enriched.GO.up$category[n])
## Error in ~modules == enriched.GO.up$category[n]: object 'enriched.GO.up' not found
expression.pattern.Br.graph.timecourse.v3.0annotation.cpm.2(target.genes=gene.up.category[1:10,]) # works
## Error in transcript_ID %in% target.genes$transcript_ID: object 'gene.up.category' not found
# scaling expression data
cpm.timecourse.v3.0.scale<-t(scale(t(cpm.timecourse.v3.0[,-1]))) %>% as_tibble() %>% bind_cols(data.frame(transcript_ID=cpm.timecourse.v3.0$transcript_ID[]),.)
gene.up.category<-cpm.timecourse.v3.0.log.largeCV.modules %>% filter(transcript_ID %in% gene.up,modules==enriched.GO.up$category[n])
## Error in ~modules == enriched.GO.up$category[n]: object 'enriched.GO.up' not found
gene.down.category<-cpm.timecourse.v3.0.log.largeCV.modules %>% filter(transcript_ID %in% gene.down,modules==enriched.GO.up$category[n])
## Error in ~modules == enriched.GO.up$category[n]: object 'enriched.GO.up' not found
expression.pattern.Br.graph.timecourse.v3.0annotation.cpm.2(data=cpm.timecourse.v3.0.scale,target.genes=gene.up.category[1,])
## Error in transcript_ID %in% target.genes$transcript_ID: object 'gene.up.category' not found
input<-tribble(
~target.genes,~data,~f,
gene.up.category[1:10,],cpm.timecourse.v3.0.scale,expression.pattern.Br.graph.timecourse.v3.0annotation.cpm.2,
gene.down.category[1:10,],cpm.timecourse.v3.0.scale,expression.pattern.Br.graph.timecourse.v3.0annotation.cpm.2
)
input2<-tribble(
~f,~param,
expression.pattern.Br.graph.timecourse.v3.0annotation.cpm.2,list(target.genes=gene.up.category[1:10,],data=cpm.timecourse.v3.0.scale),
expression.pattern.Br.graph.timecourse.v3.0annotation.cpm.2,list(target.genes=gene.down.category[1:10,],data=cpm.timecourse.v3.0.scale)
)
test<-input2 %>% mutate(output=invoke_map(f,param)) # works, but parameters are not visible
# how about to use map2?
## an example
params<-tribble(
~mean,~sd,~n,
5,1,1,
10,5,3,
-3,10,5
)
params %>% pmap(rnorm)
#
input3<-tribble(
~target.genes,~data,~title,
gene.up.category[1:10,],cpm.timecourse.v3.0.scale,"2-afternoon soil up",
gene.down.category[1:10,],cpm.timecourse.v3.0.scale,"2-afternoon soil down",
)
#input3 %>% pmap(expression.pattern.Br.graph.timecourse.v3.0annotation.cpm.2) -> expression.pattern
#
expression.pattern <- input3 %>% mutate(plot=pmap(.,expression.pattern.Br.graph.timecourse.v3.0annotation.cpm.2))
expression.pattern$plot[1] # plot
#input3 %>% mutate(plot=invoke_map(~expression.pattern.Br.graph.timecourse.v3.0annotation.cpm.2)) # errors
temp.abs<-cpm.timecourse.v3.0.log %>% head() %>%
gather(sample,value,-transcript_ID) %>%
mutate(abs.value=2^value) %>%
inner_join(sample.description.timecourse, by="sample") %>%
split(.$soil_trt)
# Helper: mean absolute-scale expression per group x transcript_ID within one soil treatment.
mean.by.soil<-function(x) {group_by(x, group,transcript_ID) %>% summarize(mean=mean(abs.value))}
temp.abs.mean<-temp.abs %>% map(.,mean.by.soil)
cpm.timecourse.v3.0.logFC<-tibble(transcript_ID=temp.abs.mean[["SBC_OLD"]]$transcript_ID,group=temp.abs.mean[["SBC_OLD"]]$group,logFC=log(temp.abs.mean[["SBC_OLD"]]$mean/temp.abs.mean[["ATM_BLANK"]]$mean)) %>% left_join(sample.description.timecourse %>% dplyr::select("group","sampling_day","sampling_time"),by="group")
# check logFC range
range(cpm.timecourse.v3.0.logFC$logFC) #(Jan 31, 2020)
temp.abs<-cpm.timecourse.v3.0.log %>%
gather(sample,value,-transcript_ID) %>% mutate(abs.value=2^value) %>%
inner_join(sample.description.timecourse, by="sample") %>%
split(.$soil_trt)
# mean of absolute value function (re-defines the identical helper declared earlier)
mean.by.soil<-function(x) {group_by(x, group,transcript_ID) %>% summarize(mean=mean(abs.value))}
# calculating absolute value mean
temp.abs.mean<-temp.abs %>% map(.,mean.by.soil)
# making summary tibble
temp2<-sample.description.timecourse %>% dplyr::select("group","sampling_day","sampling_time")
sample.description.timecourse.logFC<-temp2[!duplicated(temp2),]
# add sample info
cpm.timecourse.v3.0.logFC<-tibble(transcript_ID=temp.abs.mean[["SBC_OLD"]]$transcript_ID,group=temp.abs.mean[["SBC_OLD"]]$group,logFC=log(temp.abs.mean[["SBC_OLD"]]$mean/temp.abs.mean[["ATM_BLANK"]]$mean)) %>% left_join(sample.description.timecourse.logFC,by="group")
# check
dim(cpm.timecourse.v3.0.logFC)
# check frequency distribution
a<-hist(cpm.timecourse.v3.0.logFC$logFC) # most of them are small
a
# what are genes with super high logFC?
high.FC.genes<-cpm.timecourse.v3.0.logFC %>% filter(abs(logFC)>5) %>% dplyr::select(transcript_ID)
expression.pattern.Br.graph.timecourse.v3.0annotation.cpm.2(target.genes = high.FC.genes[1:10,])
#
# addAnno2: like addAnno, but for tables keyed by a transcript_ID column
# instead of rownames. Joins the best Arabidopsis BLAST hit per gene, writes
# ../output/<object-name>.v3.0anno.csv, and returns the annotated table.
# NOTE: deparse(substitute(DGE)) only yields a usable file name when called
# with a bare object name (not through a pipe).
addAnno2<-function(DGE) {temp<-left_join(DGE,Br.v3.0anno.At.BLAST.highscore,by=c("transcript_ID"="name")) %>% dplyr::select(transcript_ID,names(DGE),AGI, At_symbol, At_short_description, perc_ID); print(deparse(substitute(DGE)));
write_csv(temp, path=file.path("..","output",paste(deparse(substitute(DGE)),".v3.0anno.csv",sep="")));
return(temp)}
#
addAnno2(high.FC.genes)
#
dim(cpm.timecourse.v3.0.logFC) #[1] 1110000 5
#write_csv(cpm.timecourse.v3.0.logFC,path="../output/cpm.timecourse.v3.0.logFC.csv") # too large (306 M)
write_csv(cpm.timecourse.v3.0.logFC,path="../output/cpm.timecourse.v3.0.logFC.csv.gz") # 12.3 M
cpm.timecourse.v3.0.logFC <-read_csv("../output/cpm.timecourse.v3.0.logFC.csv.gz")
## Parsed with column specification:
## cols(
## transcript_ID = col_character(),
## group = col_character(),
## logFC = col_double(),
## sampling_day = col_character(),
## sampling_time = col_character()
## )
target.genes<-gene.up
# expression.pattern.Br.graph.timecourse.v3.0annotation.logFC<-function(data=cpm.timecourse.v3.0.logFC,target.genes,title="",subset.data="only_two_afternoon"){
# #print(paste("data is",data[1:10,]))
# #print(paste("tissue.type is root"))
# data[is.na(data)] <- 0 #
# data.temp<-data %>% dplyr::filter(transcript_ID %in% target.genes)
#
# # if (2-afternoon=TRUE)
# if (subset.data=="only_two_afternoon") {
# p<-data.temp %>% ggplot(aes(x=sampling_day,y=logFC)) +
# geom_boxplot(alpha = 0.5) +
# theme_bw() +
# theme(strip.text.y=element_text(angle=0),axis.text.x=element_text(angle=90)) +
# theme(legend.position="bottom") + labs(title=title)
# p
# } else {print("Define subset.data other than only_two_afternoon.")}
# }
# test the function
expression.pattern.Br.graph.timecourse.v3.0annotation.logFC(target.genes=gene.up,subset.data="only_two_afternoon")
# 2_afternoon DEG expression data (scaled)
cpm.timecourse.v3.0.scale.twoafternoon.DEG<-cpm.timecourse.v3.0.scale %>% dplyr::select(-cv) %>%
inner_join(twoafternoon.trtlive.DEGs.all.v3.0anno %>% filter(FDR<0.05) %>% dplyr::select(genes),by=c(transcript_ID="genes")) %>%
gather(sample,value,-1) %>% inner_join(sample.description.timecourse,by="sample") %>% filter(sampling_time=="2_afternoon")
## Warning: Column `transcript_ID`/`genes` joining factor and character vector,
## coercing into character vector
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# spread
# NOTE(review): the trailing -1 is passed positionally as spread()'s `fill`
# argument, so any missing sample/gene combination becomes -1 rather than NA.
# It looks copied from a gather(..., -1) call — confirm this is intended.
cpm.timecourse.v3.0.scale.twoafternoon.DEG.spread<-cpm.timecourse.v3.0.scale.twoafternoon.DEG %>% dplyr::select(transcript_ID,sample,value) %>% spread(sample,value,-1)
dim(cpm.timecourse.v3.0.scale.twoafternoon.DEG.spread) # [1] 1442 97
## [1] 1442 97
# calculate wss
wss <- (nrow(cpm.timecourse.v3.0.scale.twoafternoon.DEG.spread[,-1])-1)*sum(apply(cpm.timecourse.v3.0.scale.twoafternoon.DEG.spread[,-1],2,var))
for (i in 2:20) wss[i] <- sum(kmeans(cpm.timecourse.v3.0.scale.twoafternoon.DEG.spread[,-1],
centers=i,iter.max = 10)$withinss) # If default iter.max=10 gave me "did not converge in 10 iterations" error. Solution: https://r.789695.n4.nabble.com/kmeans-quot-did-not-converge-in-10-iterations-quot-td797019.html.
plot(1:20, wss, type="b", xlab="Number of Clusters", ylab="Within groups sum of squares")
# Let's perform the actual clustering using K=8:
# (this narrative line was not commented out in the original — syntax error)
set.seed(20)
kClust.8 <- kmeans(cpm.timecourse.v3.0.scale.twoafternoon.DEG.spread[,-1], centers=8, nstart = 1000, iter.max = 20)
kClusters.8 <- kClust.8$cluster
# number of genes per cluster
# FIX: sum(cluster) summed the cluster id n times (n * id) instead of the
# member count used in the plot labels below; n() counts rows per group.
cluster.8.num<-tibble(cluster=kClusters.8) %>% group_by(cluster) %>% summarize(n=n())
cluster.8.num$cluster<-as.character(cluster.8.num$cluster) # classic way
# Now we can calculate the cluster 'cores' aka centroids:
# (this narrative line was not commented out in the original — syntax error)
# function to find centroid in cluster i
# clust.centroid: column means of the rows of `dat` assigned to cluster `i`.
# drop = FALSE keeps a single-row selection two-dimensional, so colMeans()
# still works for singleton clusters (a matrix would otherwise collapse to a
# plain vector and colMeans() would error).
clust.centroid = function(i, dat, clusters) {
  ind = (clusters == i)
  colMeans(dat[ind, , drop = FALSE])
}
kClustcentroids.8 <- sapply(levels(factor(kClusters.8)), clust.centroid, cpm.timecourse.v3.0.scale.twoafternoon.DEG.spread[,-1], kClusters.8)
# adding sample description to data
data.sample<-kClustcentroids.8 %>% as_tibble(rownames="sample") %>%
gather(cluster,value,-1) %>%
inner_join(sample.description.timecourse,by="sample") %>%
inner_join(cluster.8.num,by="cluster") %>%
mutate(cluster.n=glue('{cluster2} \n({n2})',
cluster2=cluster,
n2=n) )
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# data.group for adding group cluster mean
data.group<-data.sample %>% unite("group.cluster", c("group","cluster"),remove=FALSE) %>% group_by(group.cluster) %>% summarize(group.cluster.mean=mean(value)) %>% inner_join(
data.sample %>% unite("group.cluster", c("group","cluster"),remove=FALSE) %>% dplyr::select("group.cluster","sampling_day","soil_trt","cluster.n","cluster"),by="group.cluster") %>% dplyr::slice(rep(1:768)[!duplicated(.$group.cluster)])
# plot
p8<- ggplot(data.sample,aes(x=soil_trt,y=value, group=cluster, colour=as.factor(cluster))) +
geom_point() + geom_hline(yintercept=0,color="red") +
geom_line(data=data.group,aes(x=soil_trt,y=group.cluster.mean)) +
facet_grid(cluster.n~sampling_day) + theme(axis.text=element_text(angle=90),strip.text.y=element_text(angle=0))+
labs(title= "K-means clustering of two afternoon DEGs (live vs dead soil): eight clusters",color = "Cluster",y="scaled expression level")
p8
ggsave(p8,file="../output/Twoafternoon.DEG.Kmean.8clusters.png",width=11,height=8)
set.seed(20)
kClust.5 <- kmeans(cpm.timecourse.v3.0.scale.twoafternoon.DEG.spread[,-1], centers=5, nstart = 1000, iter.max = 20)
kClusters.5 <- kClust.5$cluster
# number of genes per cluster
# FIX: sum(cluster) summed the cluster id (n * id) instead of counting members.
cluster.5.num<-tibble(cluster=kClusters.5) %>% group_by(cluster) %>% summarize(n=n())
cluster.5.num$cluster<-as.character(cluster.5.num$cluster) # classic way
kClustcentroids.5 <- sapply(levels(factor(kClusters.5)), clust.centroid, cpm.timecourse.v3.0.scale.twoafternoon.DEG.spread[,-1], kClusters.5)
# adding sample description to data
data.sample<-kClustcentroids.5 %>% as_tibble(rownames="sample") %>%
gather(cluster,value,-1) %>%
inner_join(sample.description.timecourse,by="sample") %>%
inner_join(cluster.5.num,by="cluster") %>%
mutate(cluster.n=glue('{cluster2} \n({n2})',
cluster2=cluster,
n2=n) )
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# data.group for adding group cluster mean
data.group<-data.sample %>% unite("group.cluster", c("group","cluster"),remove=FALSE) %>% group_by(group.cluster) %>% summarize(group.cluster.mean=mean(value)) %>% inner_join(
data.sample %>% unite("group.cluster", c("group","cluster"),remove=FALSE) %>% dplyr::select("group.cluster","sampling_day","soil_trt","cluster.n","cluster"),by="group.cluster") %>% dplyr::slice(rep(1:480)[!duplicated(.$group.cluster)]) # only cluster 1... why???
# plot
p5<- ggplot(data.sample,aes(x=soil_trt,y=value, group=cluster, colour=as.factor(cluster))) +
geom_point() + geom_hline(yintercept=0,color="red") +
geom_line(data=data.group,aes(x=soil_trt,y=group.cluster.mean)) +
facet_grid(cluster.n~sampling_day) + theme(axis.text=element_text(angle=90),strip.text.y=element_text(angle=0))+
labs(title= "K-means clustering of two afternoon DEGs (live vs dead soil): five clusters",color = "Cluster",y="scaled expression level")
p5
ggsave(p5,file="../output/Twoafternoon.DEG.Kmean.5clusters.png",width=11,height=8)
expression.pattern.Br.graph.timecourse.v3.0annotation.logFC(target.genes=gene.up,subset.data="only_two_afternoon")
# 2_afternoon DEG expression data (scaled)
cpm.timecourse.v3.0.logFC.twoafternoon.DEG<-cpm.timecourse.v3.0.logFC %>%
inner_join(twoafternoon.trtlive.DEGs.all.v3.0anno %>% filter(FDR<0.05) %>% dplyr::select(genes),by=c(transcript_ID="genes")) %>% filter(sampling_time=="2_afternoon")
# spread
cpm.timecourse.v3.0.logFC.twoafternoon.DEG.spread<-cpm.timecourse.v3.0.logFC.twoafternoon.DEG %>% dplyr::select(transcript_ID,group,logFC) %>% spread(group,logFC,-1)
dim(cpm.timecourse.v3.0.logFC.twoafternoon.DEG.spread) # [1] 1474 97
## [1] 1442 9
# calculate wss
wss.logFC <- (nrow(cpm.timecourse.v3.0.logFC.twoafternoon.DEG.spread[,-1])-1)*sum(apply(cpm.timecourse.v3.0.logFC.twoafternoon.DEG.spread[,-1],2,var))
for (i in 2:20) wss.logFC[i] <- sum(kmeans(cpm.timecourse.v3.0.logFC.twoafternoon.DEG.spread[,-1],
centers=i,iter.max = 20)$withinss) # If default iter.max=10 gave me "did not converge in 10 iterations" error. Solution: https://r.789695.n4.nabble.com/kmeans-quot-did-not-converge-in-10-iterations-quot-td797019.html.
plot(1:20, wss.logFC, type="b", xlab="Number of Clusters", ylab="Within groups sum of squares")
# Let's perform the actual clustering using K=5:
set.seed(20)
kClust.logFC.5 <- kmeans(cpm.timecourse.v3.0.logFC.twoafternoon.DEG.spread[,-1], centers=5, nstart = 1000, iter.max = 20)
kClusters.logFC.5 <- kClust.logFC.5$cluster
# number of genes per cluster
# FIX: sum(cluster) summed the cluster id (n * id) instead of counting members.
cluster.5.num<-tibble(cluster=kClusters.logFC.5) %>% group_by(cluster) %>% summarize(n=n())
cluster.5.num$cluster<-as.character(cluster.5.num$cluster) # classic way
kClustcentroids.logFC.5 <- sapply(levels(factor(kClusters.logFC.5)), clust.centroid, cpm.timecourse.v3.0.logFC.twoafternoon.DEG.spread[,-1], kClusters.logFC.5)
# making sample.description.timecourse.logFC
temp2<-sample.description.timecourse %>% dplyr::select("group","sampling_day","sampling_time")
sample.description.timecourse.logFC<-temp2[!duplicated(temp2),]
# Plot the 5-cluster centroid trajectories (logFC data) across sampling days.
plot.df.logFC.5 <- kClustcentroids.logFC.5 %>%
  as_tibble(rownames = "group") %>%
  gather(cluster, value, -1) %>%
  inner_join(sample.description.timecourse.logFC, by = "group") %>%
  inner_join(cluster.5.num, by = "cluster") %>%
  mutate(cluster.n = glue('{cluster} \n({n})'))
p.logFC.5 <- ggplot(plot.df.logFC.5,
                    aes(x = sampling_day, y = value,
                        group = cluster, colour = as.factor(cluster))) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red") +
  facet_grid(cluster.n ~ .) +
  theme(axis.text = element_text(angle = 90),
        strip.text.y = element_text(angle = 0)) +
  labs(title = "K-means clustering of two afternoon DEGs (live vs dead soil): five clusters",
       color = "Cluster", y = "scaled expression level")
## Warning: Column `group` joining character vector and factor, coercing into
## character vector
p.logFC.5
ggsave(p.logFC.5, file = "../output/Twoafternoon.DEG.logFC.Kmean.5clusters.png", width = 11, height = 8)
set.seed(20)
kClust.logFC.8 <- kmeans(cpm.timecourse.v3.0.logFC.twoafternoon.DEG.spread[,-1], centers=8, nstart = 1000, iter.max = 20)
kClusters.logFC.8 <- kClust.logFC.8$cluster
# number of genes per cluster.
# BUGFIX: was summarize(n=sum(cluster)), which returns cluster_id * member
# count, not the member count; n() matches the later cluster.any.trtlive.*.num
# blocks and fixes the counts shown in the facet labels.
cluster.8.num<-tibble(cluster=kClusters.logFC.8) %>% group_by(cluster) %>% summarize(n=n())
cluster.8.num$cluster<-as.character(cluster.8.num$cluster) # character so it joins against the gathered cluster labels
# per-cluster centroids: one column per cluster, one row per sample group
kClustcentroids.logFC.8 <- sapply(levels(factor(kClusters.logFC.8)), clust.centroid, cpm.timecourse.v3.0.logFC.twoafternoon.DEG.spread[,-1], kClusters.logFC.8)
# Plot the 8-cluster centroid trajectories (logFC data) across sampling days.
plot.df.logFC.8 <- kClustcentroids.logFC.8 %>%
  as_tibble(rownames = "group") %>%
  gather(cluster, value, -1) %>%
  inner_join(sample.description.timecourse.logFC, by = "group") %>%
  inner_join(cluster.8.num, by = "cluster") %>%
  mutate(cluster.n = glue('{cluster} \n({n})'))
p.logFC.8 <- ggplot(plot.df.logFC.8,
                    aes(x = sampling_day, y = value,
                        group = cluster, colour = as.factor(cluster))) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red") +
  facet_grid(cluster.n ~ .) +
  theme(axis.text = element_text(angle = 90),
        strip.text.y = element_text(angle = 0)) +
  labs(title = "K-means clustering of two afternoon DEGs (live vs dead soil): eight clusters",
       color = "Cluster", y = "scaled expression level")
## Warning: Column `group` joining character vector and factor, coercing into
## character vector
p.logFC.8
ggsave(p.logFC.8, file = "../output/Twoafternoon.DEG.logFC.Kmean.8clusters.png", width = 11, height = 8)
load(file.path("..","Annotation_copy","output","v3.0annotation","Brgo.v3.0anno.Atgoslim.BP.list.Rdata"))
# GOseq
library(ShortRead);library(goseq);library(GO.db);library("annotate")
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following object is masked from 'package:limma':
##
## plotMA
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which, which.max, which.min
## Loading required package: BiocParallel
## Loading required package: Biostrings
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:glue':
##
## collapse, trim
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:purrr':
##
## reduce
## Loading required package: XVector
##
## Attaching package: 'XVector'
## The following object is masked from 'package:purrr':
##
## compact
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
## Loading required package: Rsamtools
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: GenomicAlignments
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:purrr':
##
## simplify
## The following objects are masked from 'package:base':
##
## aperm, apply, rowsum
##
## Attaching package: 'GenomicAlignments'
## The following object is masked from 'package:dplyr':
##
## last
##
## Attaching package: 'ShortRead'
## The following object is masked from 'package:dplyr':
##
## id
## The following object is masked from 'package:purrr':
##
## compose
## The following object is masked from 'package:tibble':
##
## view
## Loading required package: BiasedUrn
## Loading required package: geneLenDataBase
##
## Attaching package: 'geneLenDataBase'
## The following object is masked from 'package:S4Vectors':
##
## unfactor
##
## Loading required package: AnnotationDbi
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: XML
# for ggplot heatmap
## uncompress gz file into a temporary plain-text fasta.
## NOTE(review): recent Biostrings::readDNAStringSet() can read gzipped fasta
## directly, which would make the decompress/remove steps unnecessary -- confirm.
Brapa.cds.gz<-file.path("..","Annotation_copy","input","v3.0annotation","Brapa_genome_v3.0_cds.gz")
Brapa.cds.fa<-file.path("..","Annotation_copy","input","v3.0annotation","Brapa_genome_v3.0_cds.fa")
system(paste("gunzip -c ",Brapa.cds.gz," > ",Brapa.cds.fa))
## read cDNA fasta file; sequence lengths are later used as goseq bias.data
Bra.v3.0_cdna<-readDNAStringSet(Brapa.cds.fa) # copied from /Volumes/data_work/Data8/NGS_related/Brassica_rapa_Upendra/G3
Bra.v3.0_cdna
## A DNAStringSet instance of length 46250
## width seq names
## [1] 1254 ATGCGACCACCGGGTGTTGTT...GAGTCTCTCTTGCTCGCTTAA BraA01g000010.3C
## [2] 1668 ATGCCAGCAATGCATGCCGTT...AGATGGATCACAAAAGATTAA BraA01g000020.3C
## [3] 957 ATGATGCTTCTCGTTCATACC...AACTTGGAGTTCCCTGAGTGA BraA01g000030.3C
## [4] 1299 ATGAGTCGTCTTCTCCTTGCT...GGGTCACGAGATGAGCTATAA BraA01g000040.3C
## [5] 774 ATGGATTCTGGGCTTCAGCAT...GGAAAGCAGTTCCTTTCGTGA BraA01g000050.3C
## ... ... ...
## [46246] 162 ATGCGTCCGTCCTCAGCTCCC...TCTTTGGTGGTCCGGTTCTAA BraAnng001840.3C
## [46247] 1455 ATGTCTAATCAAGGATCAGGA...ACAGGTTTGTTTAGGTGCTAA BraAnng001850.3C
## [46248] 1011 ATGGACAACGTAATTCTGAAA...TCAGGGAAGAAAAGCCCCTGA BraAnng006150.3C
## [46249] 870 ATGTTTCCAAGACGTACAAGG...AGCAGTTGTCCTTATAGTTAG BraAnng000040.3C
## [46250] 1338 ATGCCGCAACAATACTGGAAC...GGAGAGAACCTTATCTCCTGA BraAnng003440.3C
## remove the uncompressed fasta file.
# FIX(portability/idiom): use file.remove() instead of system("rm ...");
# invisible() suppresses the printed TRUE in knitted output.
invisible(file.remove(Brapa.cds.fa))
# GOseq function
# GO over-representation analysis (ORA) for B. rapa v3.0 genes via goseq.
#   genelist:             character vector of gene IDs to test for enrichment.
#   padjust:              BH-adjusted p-value cutoff (default 0.05).
#   ontology:             "BP", "CC" or "MF" -- which GO ontology to keep.
#   custom.category.list: gene -> GO category mapping passed to goseq().
#   Br_cdna:              DNAStringSet of CDS sequences; their lengths supply
#                         the length-bias data for nullp().
# Returns the enriched-GO data.frame with Term/Definition columns added, or
# the string "no enriched GO" when nothing passes the cutoff.
GOseq.Brgo.v3.0.Atgoslim.BP.list.ORA<-function(genelist,padjust=0.05,ontology="BP",custom.category.list=Brgo.v3.0anno.Atgoslim.BP.list,Br_cdna=Bra.v3.0_cdna) { # return GO enrichment table, padjus, padjust=0.05.
# gene-length bias vector, named by gene ID (required by nullp)
bias<-nchar(Br_cdna)
names(bias)<-names(Br_cdna)
# 0/1 membership vector over ALL genes; must align with bias.data
TF<-(names(bias) %in% genelist)*1
names(TF)<-names(bias)
#print(TF)
pwf<-nullp(TF,bias.data=bias)
#print(pwf$DEgenes)
GO.pval <- goseq(pwf,gene2cat=custom.category.list,use_genes_without_cat=TRUE) # format became different in new goseq version (021111). Does not work (042716)
#GO.pval <- goseq(pwf,gene2cat=Brgo.DF3,use_genes_without_cat=TRUE) # format became different in new goseq version (021111)
#head(GO.pval)
# keep only the requested ontology.
# NOTE(review): inside subset() the `ontology` COLUMN of GO.pval shadows the
# function argument of the same name, so each comparison is column == literal;
# it works, but the shadowing is fragile.
if(ontology=="BP") {
GO.pval2<-subset(GO.pval,ontology=="BP")
} else if(ontology=="CC") {
GO.pval2<-subset(GO.pval,ontology=="CC")
} else {
GO.pval2<-subset(GO.pval,ontology=="MF")
}
GO.pval2$over_represented_padjust<-p.adjust(GO.pval2$over_represented_pvalue,method="BH")
# row 1 is assumed to hold the smallest adjusted p-value (goseq sorts by
# over_represented_pvalue) -- TODO confirm for the goseq version in use.
if(GO.pval2$over_represented_padjust[1]>padjust) return("no enriched GO")
else {
enriched.GO<-GO.pval2[GO.pval2$over_represented_padjust<padjust,]
print("enriched.GO is")
print(enriched.GO)
## write Term and Definition
# look up human-readable Term/Definition for each enriched GO ID;
# the Term/Definition columns are grown row by row inside this loop
for(i in 1:dim(enriched.GO)[1]) {
if(is.null(Term(GOTERM[enriched.GO[i,"category"]]))) {next} else {
enriched.GO$Term[i]<-Term(GOTERM[[enriched.GO[i,"category"]]])
enriched.GO$Definition[i]<-Definition(GOTERM[[enriched.GO[i,"category"]]])
}
}
return(enriched.GO)
}
}
#
head(Bra.v3.0_cdna)
## A DNAStringSet instance of length 6
## width seq names
## [1] 1254 ATGCGACCACCGGGTGTTGTTTC...CTGAGTCTCTCTTGCTCGCTTAA BraA01g000010.3C
## [2] 1668 ATGCCAGCAATGCATGCCGTTTT...GTAGATGGATCACAAAAGATTAA BraA01g000020.3C
## [3] 957 ATGATGCTTCTCGTTCATACCCG...GGAACTTGGAGTTCCCTGAGTGA BraA01g000030.3C
## [4] 1299 ATGAGTCGTCTTCTCCTTGCTCA...GTGGGTCACGAGATGAGCTATAA BraA01g000040.3C
## [5] 774 ATGGATTCTGGGCTTCAGCATCT...AAGGAAAGCAGTTCCTTTCGTGA BraA01g000050.3C
## [6] 3327 ATGGCGTCCACTCCTCCTCAAAA...GCGGTGGGTTTCAATTTCCTTGA BraA01g000060.3C
# length(bias) # 44239 > 45019 where the bias come from?
# bias.data vector must have the same length as DEgenes vector!
# 2_afternoon DEG expression data (scaled): keep "any soil_trt effect" DEGs
# (FDR < 0.05), attach sample metadata, restrict to the 2_afternoon time point.
cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG<-cpm.timecourse.v3.0.scale %>% dplyr::select(-cv) %>%
inner_join(twoafternoon.any.trtlive.DEGs.all.v3.0anno %>% filter(FDR<0.05) %>% dplyr::select(genes),by=c(transcript_ID="genes")) %>%
gather(sample,value,-1) %>% inner_join(sample.description.timecourse,by="sample") %>% filter(sampling_time=="2_afternoon")
## Warning: Column `transcript_ID`/`genes` joining factor and character vector,
## coercing into character vector
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# spread back to a genes x samples matrix for kmeans.
# NOTE(review): spread()'s third positional argument is `fill`, so `-1` fills
# missing combinations with -1 rather than dropping column 1 -- confirm intent.
cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread<-cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG %>% dplyr::select(transcript_ID,sample,value) %>% spread(sample,value,-1)
dim(cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread) # [1] 2178 97
## [1] 2178 97
# within-groups sum of squares for K = 1..20 (elbow plot)
wss <- (nrow(cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread[,-1])-1)*sum(apply(cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread[,-1],2,var))
for (i in 2:20) wss[i] <- sum(kmeans(cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread[,-1],
centers=i,iter.max = 20)$withinss) # BUGFIX: iter.max raised 10 -> 20 to match the other wss loops; the comment here already noted that the default 10 gave a "did not converge in 10 iterations" error (https://r.789695.n4.nabble.com/kmeans-quot-did-not-converge-in-10-iterations-quot-td797019.html) but the value had not been changed.
plot(1:20, wss, type="b", xlab="Number of Clusters", ylab="Within groups sum of squares")
Let’s perform the actual clustering using K=5:
set.seed(20)
kClust.any.trtlive.5 <- kmeans(cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread[,-1], centers=5, nstart = 1000, iter.max = 20)
kClusters.any.trtlive.5 <- kClust.any.trtlive.5$cluster
# number of clusters
cluster.any.trtlive.5.num<-tibble(cluster=kClusters.any.trtlive.5) %>% group_by(cluster) %>% summarize(n=n())
cluster.any.trtlive.5.num$cluster<-as.character(cluster.any.trtlive.5.num$cluster) # classic way
cluster.any.trtlive.5.num
Now we can calculate the cluster ‘cores’ aka centroids: # find centroid in cluster
kClustcentroids.any.trtlive.5 <- sapply(levels(factor(kClusters.any.trtlive.5)), clust.centroid, cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread[,-1], kClusters.any.trtlive.5)
kClustcentroids.any.trtlive.5 %>% head()
## 1 2 3 4
## 1a1_q_002_S1_R1_001 -0.370624345 -0.27016316 -0.781291105 -0.09145448
## 1a3_q_004_S3_R1_001 0.586129826 -0.45516870 -0.082928903 -0.31258537
## 1a7_q_007_d8_S7_R1_001 0.603390134 -0.06622119 -0.118731825 -0.06524704
## 1a8_q_008_d8_S8_R1_001 0.513232164 -0.44547237 -0.347272642 -0.23834981
## 1c6_q_028_S22_R1_001 -0.008042207 -0.39487064 0.006183179 -0.18385917
## 1d3_q_038_S27_R1_001 0.423676033 -0.18654295 -0.166417100 -0.15567393
## 5
## 1a1_q_002_S1_R1_001 0.48593350
## 1a3_q_004_S3_R1_001 -0.04775671
## 1a7_q_007_d8_S7_R1_001 -0.02718576
## 1a8_q_008_d8_S8_R1_001 0.19646882
## 1c6_q_028_S22_R1_001 0.18640730
## 1d3_q_038_S27_R1_001 0.01866725
# adding sample description to data
data.sample<-kClustcentroids.any.trtlive.5 %>% as_tibble(rownames="sample") %>%
gather(cluster,value,-1) %>%
inner_join(sample.description.timecourse,by="sample") %>%
inner_join(cluster.any.trtlive.5.num,by="cluster") %>%
mutate(cluster.n=glue('{cluster2} \n({n2})',
cluster2=cluster,
n2=n) )
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# data.group: mean centroid value per group x cluster, with sampling_day /
# soil_trt / cluster labels re-attached via join; one row per group.cluster.
data.group<-data.sample %>%
  unite("group.cluster", c("group","cluster"), remove=FALSE) %>%
  group_by(group.cluster) %>%
  summarize(group.cluster.mean=mean(value)) %>%
  inner_join(
    data.sample %>%
      unite("group.cluster", c("group","cluster"), remove=FALSE) %>%
      dplyr::select("group.cluster","sampling_day","soil_trt","cluster.n","cluster"),
    by="group.cluster") %>%
  # keep the first row per group.cluster.
  # FIX: replaces dplyr::slice(rep(1:768)[!duplicated(.$group.cluster)]),
  # whose hard-coded row count (768) breaks when the number of clusters or
  # samples changes; distinct() keeps the same first-occurrence rows.
  dplyr::distinct(group.cluster, .keep_all=TRUE)
# plot
p5.any.trtlive<- ggplot(data.sample,aes(x=soil_trt,y=value, group=cluster, colour=as.factor(cluster))) +
geom_point() + geom_hline(yintercept=0,color="red") +
geom_line(data=data.group,aes(x=soil_trt,y=group.cluster.mean)) +
facet_grid(cluster.n~sampling_day) + theme(axis.text=element_text(angle=90),strip.text.y=element_text(angle=0))+
labs(title= "K-means clustering of two afternoon DEGs (live vs dead soil): five clusters",color = "Cluster",y="scaled expression level")
p5.any.trtlive
ggsave(p5.any.trtlive,file="../output/Twoafternoon.any.trtlive.DEG.Kmean.5clusters.png",width=11,height=6)
Let’s perform the actual clustering using K=6:
set.seed(20)
kClust.any.trtlive.6 <- kmeans(cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread[,-1], centers=6, nstart = 1000, iter.max = 20)
kClusters.any.trtlive.6 <- kClust.any.trtlive.6$cluster
# number of clusters
cluster.any.trtlive.6.num<-tibble(cluster=kClusters.any.trtlive.6) %>% group_by(cluster) %>% summarize(n=n())
cluster.any.trtlive.6.num$cluster<-as.character(cluster.any.trtlive.6.num$cluster) # classic way
cluster.any.trtlive.6.num
Now we can calculate the cluster ‘cores’ aka centroids: # find centroid in cluster
kClustcentroids.any.trtlive.6 <- sapply(levels(factor(kClusters.any.trtlive.6)), clust.centroid, cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread[,-1], kClusters.any.trtlive.6)
kClustcentroids.any.trtlive.6 %>% head()
## 1 2 3 4
## 1a1_q_002_S1_R1_001 0.65133082 -0.3158853 -0.8226438 -0.37371653
## 1a3_q_004_S3_R1_001 -0.08768173 -0.5162655 -0.1554494 0.63258565
## 1a7_q_007_d8_S7_R1_001 -0.06407782 -0.0306091 -0.2095253 0.68120007
## 1a8_q_008_d8_S8_R1_001 0.31357964 -0.5018963 -0.3659202 0.54455777
## 1c6_q_028_S22_R1_001 0.07730847 -0.4843704 0.1434211 -0.03188877
## 1d3_q_038_S27_R1_001 -0.08124684 -0.1882003 -0.2357184 0.49835252
## 5 6
## 1a1_q_002_S1_R1_001 -0.05820563 -0.23794070
## 1a3_q_004_S3_R1_001 -0.22758805 -0.22464750
## 1a7_q_007_d8_S7_R1_001 -0.07040824 -0.03305671
## 1a8_q_008_d8_S8_R1_001 -0.16841988 -0.30044683
## 1c6_q_028_S22_R1_001 -0.12421671 -0.15052035
## 1d3_q_038_S27_R1_001 -0.14962567 -0.05877720
# adding sample description to data
data.sample<-kClustcentroids.any.trtlive.6 %>% as_tibble(rownames="sample") %>%
gather(cluster,value,-1) %>%
inner_join(sample.description.timecourse,by="sample") %>%
inner_join(cluster.any.trtlive.6.num,by="cluster") %>%
mutate(cluster.n=glue('{cluster2} \n({n2})',
cluster2=cluster,
n2=n) )
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# data.group: mean centroid value per group x cluster, with sampling_day /
# soil_trt / cluster labels re-attached via join; one row per group.cluster.
data.group<-data.sample %>%
  unite("group.cluster", c("group","cluster"), remove=FALSE) %>%
  group_by(group.cluster) %>%
  summarize(group.cluster.mean=mean(value)) %>%
  inner_join(
    data.sample %>%
      unite("group.cluster", c("group","cluster"), remove=FALSE) %>%
      dplyr::select("group.cluster","sampling_day","soil_trt","cluster.n","cluster"),
    by="group.cluster") %>%
  # keep the first row per group.cluster.
  # FIX: replaces dplyr::slice(rep(1:768)[!duplicated(.$group.cluster)]),
  # whose hard-coded row count (768) breaks when the number of clusters or
  # samples changes; distinct() keeps the same first-occurrence rows.
  dplyr::distinct(group.cluster, .keep_all=TRUE)
# plot
p6.any.trtlive<- ggplot(data.sample,aes(x=soil_trt,y=value, group=cluster, colour=as.factor(cluster))) +
geom_point() + geom_hline(yintercept=0,color="red") +
geom_line(data=data.group,aes(x=soil_trt,y=group.cluster.mean)) +
facet_grid(cluster.n~sampling_day) + theme(axis.text=element_text(angle=90),strip.text.y=element_text(angle=0))+
labs(title= "K-means clustering of two afternoon DEGs (live vs dead soil): six clusters",color = "Cluster",y="scaled expression level")
p6.any.trtlive
ggsave(p6.any.trtlive,file="../output/Twoafternoon.any.trtlive.DEG.Kmean.6clusters.png",width=11,height=6)
Let’s perform the actual clustering using K=8:
set.seed(20)
kClust.any.trtlive.8 <- kmeans(cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread[,-1], centers=8, nstart = 1000, iter.max = 20)
kClusters.any.trtlive.8 <- kClust.any.trtlive.8$cluster
# number of clusters
cluster.any.trtlive.8.num<-tibble(cluster=kClusters.any.trtlive.8) %>% group_by(cluster) %>% summarize(n=n())
cluster.any.trtlive.8.num$cluster<-as.character(cluster.any.trtlive.8.num$cluster) # classic way
cluster.any.trtlive.8.num
Now we can calculate the cluster ‘cores’ aka centroids: # function to find centroid in cluster
# Centroid of cluster i: per-column (per-sample) means over the rows of `dat`
# whose entry in `clusters` equals i.
clust.centroid <- function(i, dat, clusters) {
  colMeans(dat[clusters == i, ])
}
kClustcentroids.any.trtlive.8 <- sapply(levels(factor(kClusters.any.trtlive.8)), clust.centroid, cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread[,-1], kClusters.any.trtlive.8)
kClustcentroids.any.trtlive.8 %>% head()
## 1 2 3 4
## 1a1_q_002_S1_R1_001 0.66774332 -0.32765717 -0.04664319 -0.3967766
## 1a3_q_004_S3_R1_001 -0.16465433 -0.35598558 -0.23130395 -0.6495724
## 1a7_q_007_d8_S7_R1_001 -0.09954455 0.07985282 -0.05066739 -0.2320192
## 1a8_q_008_d8_S8_R1_001 0.29726762 -0.41141910 -0.16242202 -0.6164058
## 1c6_q_028_S22_R1_001 0.05698738 -0.44714663 -0.12673854 -0.4659240
## 1d3_q_038_S27_R1_001 -0.06494311 -0.01126279 -0.13568017 -0.4660710
## 5 6 7 8
## 1a1_q_002_S1_R1_001 -0.15395817 -0.9249212 -0.35478101 -0.584426385
## 1a3_q_004_S3_R1_001 -0.27426807 -0.4472171 0.58951253 0.598921297
## 1a7_q_007_d8_S7_R1_001 -0.01876720 -0.3831983 0.72398958 0.051023263
## 1a8_q_008_d8_S8_R1_001 -0.28330120 -0.4472883 0.55642867 -0.032406247
## 1c6_q_028_S22_R1_001 -0.11632418 0.3594808 -0.04093214 0.001411183
## 1d3_q_038_S27_R1_001 -0.01812421 -0.2720918 0.51892604 -0.120575363
# adding sample description to data
data.sample<-kClustcentroids.any.trtlive.8 %>% as_tibble(rownames="sample") %>%
gather(cluster,value,-1) %>%
inner_join(sample.description.timecourse,by="sample") %>%
inner_join(cluster.any.trtlive.8.num,by="cluster") %>%
mutate(cluster.n=glue('{cluster2} \n({n2})',
cluster2=cluster,
n2=n) )
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# data.group: mean centroid value per group x cluster, with sampling_day /
# soil_trt / cluster labels re-attached via join; one row per group.cluster.
data.group<-data.sample %>%
  unite("group.cluster", c("group","cluster"), remove=FALSE) %>%
  group_by(group.cluster) %>%
  summarize(group.cluster.mean=mean(value)) %>%
  inner_join(
    data.sample %>%
      unite("group.cluster", c("group","cluster"), remove=FALSE) %>%
      dplyr::select("group.cluster","sampling_day","soil_trt","cluster.n","cluster"),
    by="group.cluster") %>%
  # keep the first row per group.cluster.
  # FIX: replaces dplyr::slice(rep(1:768)[!duplicated(.$group.cluster)]),
  # whose hard-coded row count (768) breaks when the number of clusters or
  # samples changes; distinct() keeps the same first-occurrence rows.
  dplyr::distinct(group.cluster, .keep_all=TRUE)
# plot
p8.any.trtlive<- ggplot(data.sample,aes(x=soil_trt,y=value, group=cluster, colour=as.factor(cluster))) +
geom_point() + geom_hline(yintercept=0,color="red") +
geom_line(data=data.group,aes(x=soil_trt,y=group.cluster.mean)) +
facet_grid(cluster.n~sampling_day) + theme(axis.text=element_text(angle=90),strip.text.y=element_text(angle=0))+
labs(title= "K-means clustering of two afternoon DEGs (live vs dead soil): eight clusters",color = "Cluster",y="scaled expression level")
p8.any.trtlive
ggsave(p8.any.trtlive,file="../output/Twoafternoon.any.trtlive.DEG.Kmean.8clusters.png",width=11,height=8)
Let’s perform the actual clustering using K=15:
set.seed(20)
kClust.any.trtlive.15 <- kmeans(cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread[,-1], centers=15, nstart = 1000, iter.max = 20)
kClusters.any.trtlive.15 <- kClust.any.trtlive.15$cluster
# number of clusters
cluster.any.trtlive.15.num<-tibble(cluster=kClusters.any.trtlive.15) %>% group_by(cluster) %>% summarize(n=n())
cluster.any.trtlive.15.num$cluster<-as.character(cluster.any.trtlive.15.num$cluster) # classic way
cluster.any.trtlive.15.num
Now we can calculate the cluster ‘cores’ aka centroids: # function to find centroid in cluster i
# Centroid of cluster i: per-column (per-sample) means over the rows of `dat`
# whose entry in `clusters` equals i.
# NOTE(review): this re-defines the identical clust.centroid from earlier in
# the script; the duplicate is harmless but could be removed.
clust.centroid <- function(i, dat, clusters) {
  colMeans(dat[clusters == i, ])
}
kClustcentroids.any.trtlive.15 <- sapply(levels(factor(kClusters.any.trtlive.15)), clust.centroid, cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread[,-1], kClusters.any.trtlive.15)
kClustcentroids.any.trtlive.15 %>% head()
## 1 2 3 4
## 1a1_q_002_S1_R1_001 0.5363413 -0.368342751 -0.1930865 -0.9943538
## 1a3_q_004_S3_R1_001 -0.7294333 -0.364642600 -0.7338475 -0.4490430
## 1a7_q_007_d8_S7_R1_001 -0.3427135 -0.002591793 -0.1843309 -0.4124467
## 1a8_q_008_d8_S8_R1_001 -0.2509873 -0.479632975 -0.5871281 -0.4363606
## 1c6_q_028_S22_R1_001 -0.2064792 -0.519653925 -0.4785148 0.4240465
## 1d3_q_038_S27_R1_001 -0.4523775 -0.096021258 -0.3338139 -0.2100288
## 5 6 7 8
## 1a1_q_002_S1_R1_001 0.53906726 0.677494289 -0.2143971 -0.53950313
## 1a3_q_004_S3_R1_001 0.04061909 0.087791367 -0.3582435 0.18574802
## 1a7_q_007_d8_S7_R1_001 0.40465292 -0.031086513 -0.1696025 0.02199784
## 1a8_q_008_d8_S8_R1_001 0.40058289 0.358359368 -0.3610603 0.14468767
## 1c6_q_028_S22_R1_001 -0.20400698 0.146627590 -0.1992854 0.49896429
## 1d3_q_038_S27_R1_001 0.11604338 -0.009315873 -0.2560809 -0.17461624
## 9 10 11 12
## 1a1_q_002_S1_R1_001 -0.09328228 -0.05176534 -0.34883557 -0.5686172
## 1a3_q_004_S3_R1_001 -0.11607911 -0.12476358 0.94170519 0.6524345
## 1a7_q_007_d8_S7_R1_001 -0.03715975 0.14627723 -0.06335732 0.7831266
## 1a8_q_008_d8_S8_R1_001 -0.08119719 -0.14666264 0.10727360 0.4921936
## 1c6_q_028_S22_R1_001 -0.06907810 0.04886612 0.18259940 -0.2771016
## 1d3_q_038_S27_R1_001 -0.04874525 0.48051250 -0.19548676 0.7062537
## 13 14 15
## 1a1_q_002_S1_R1_001 -0.61805695 -0.317859857 -0.5165267
## 1a3_q_004_S3_R1_001 0.06162628 -0.315313752 -0.6740801
## 1a7_q_007_d8_S7_R1_001 0.14337203 0.066701371 -0.0568714
## 1a8_q_008_d8_S8_R1_001 -0.16162715 -0.378009042 -0.6538674
## 1c6_q_028_S22_R1_001 -0.16897390 -0.341682014 -0.4940442
## 1d3_q_038_S27_R1_001 -0.26367077 0.002871791 -0.4328578
# adding sample description to data
data.sample<-kClustcentroids.any.trtlive.15 %>% as_tibble(rownames="sample") %>%
gather(cluster,value,-1) %>%
inner_join(sample.description.timecourse,by="sample") %>%
inner_join(cluster.any.trtlive.15.num,by="cluster") %>%
mutate(cluster.n=glue('{cluster2} \n({n2})',
cluster2=cluster,
n2=n) )
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# data.group: mean centroid value per group x cluster, with sampling_day /
# soil_trt / cluster labels re-attached via join; one row per group.cluster.
data.group<-data.sample %>%
  unite("group.cluster", c("group","cluster"), remove=FALSE) %>%
  group_by(group.cluster) %>%
  summarize(group.cluster.mean=mean(value)) %>%
  inner_join(
    data.sample %>%
      unite("group.cluster", c("group","cluster"), remove=FALSE) %>%
      dplyr::select("group.cluster","sampling_day","soil_trt","cluster.n","cluster"),
    by="group.cluster") %>%
  # keep the first row per group.cluster.
  # FIX: replaces dplyr::slice(rep(1:1440)[!duplicated(.$group.cluster)]),
  # whose hard-coded row count (1440 here, 768 in the 5/6/8-cluster variants)
  # breaks when the number of clusters or samples changes; distinct() keeps
  # the same first-occurrence rows.
  dplyr::distinct(group.cluster, .keep_all=TRUE)
# plot
p15.any.trtlive<- ggplot(data.sample,aes(x=soil_trt,y=value, group=cluster, colour=as.factor(cluster))) +
geom_point() + geom_hline(yintercept=0,color="red") +
geom_line(data=data.group,aes(x=soil_trt,y=group.cluster.mean)) +
facet_grid(cluster.n~sampling_day) + theme(axis.text=element_text(angle=90),strip.text.y=element_text(angle=0))+
labs(title= "K-means clustering of two afternoon DEGs (live vs dead soil): fifteen clusters",color = "Cluster",y="scaled expression level")
p15.any.trtlive
ggsave(p15.any.trtlive,file="../output/Twoafternoon.any.trtlive.DEG.Kmean.15clusters.png",width=11,height=15)
# 8 Kmeans cluster (my way using enframe, which I am not satisfied)
# Run GO ORA per 8-means cluster: split gene IDs by cluster assignment, then
# map the ORA function over each cluster's gene list.
temp<-tibble(transcript_ID=cpm.timecourse.v3.0.scale.twoafternoon.any.trtlive.DEG.spread$transcript_ID, cluster=kClusters.any.trtlive.8) %>% split(.$cluster) %>% map(function(x) {GOseq.Brgo.v3.0.Atgoslim.BP.list.ORA(genelist=x$transcript_ID)})
# convert list to data.frame and write to CSV.
# NOTE(review): clusters with no enrichment return the bare string
# "no enriched GO" (not a data.frame), which unnest() will handle differently
# from the enrichment tables -- confirm the output. Also, readr >= 1.4 renames
# write_csv(path=) to file=.
temp %>% enframe(name="cluster") %>% unnest(value) %>% write_csv(path="../output/twoafternoon.any.trtsoil.DEG.Kmeans.8cluster.csv")
Julin’s method
dge.diurnal34.any.trtlive.DEGs.all.v3.0anno %>% head()
# scaling expression data
cpm.timecourse.v3.0.scale<-t(scale(t(cpm.timecourse.v3.0[,-1]))) %>% as_tibble() %>% bind_cols(data.frame(transcript_ID=cpm.timecourse.v3.0$transcript_ID[]),.)
# diurnal 3and4 days DEG expression data (scaled)
cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG<-cpm.timecourse.v3.0.scale %>%
inner_join(dge.diurnal34.any.trtlive.DEGs.all.v3.0anno %>% filter(FDR<0.05) %>% dplyr::select(genes),by=c(transcript_ID="genes")) %>%
gather(sample,value,-1) %>% inner_join(sample.description.timecourse,by="sample") %>% filter(sampling_day %in% c("03","04")) #[1] 4406 121
## Warning: Column `transcript_ID`/`genes` joining factor and character vector,
## coercing into character vector
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
with(cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG,table(sampling_day,sample)) # OK
## sample
## sampling_day 1a1_q_002_S1_R1_001 1a2_q_003_S2_R1_001 1a4_q_005_S4_R1_001
## 03 0 0 4406
## 04 4406 4406 0
## sample
## sampling_day 1a6_q_007_S6_R1_001 1b2_q_013_S10_R1_001 1b4_q_015_S12_R1_001
## 03 4406 0 4406
## 04 0 4406 0
## sample
## sampling_day 1b5_q_016_S13_R1_001 1b8_q_022_S16_R1_001 1c1_q_023_S17_R1_001
## 03 4406 0 4406
## 04 0 4406 0
## sample
## sampling_day 1c5_q_027_S21_R1_001 1c7_q_031_S23_R1_001 1c8_q_032_S24_R1_001
## 03 0 0 0
## 04 4406 4406 4406
## sample
## sampling_day 1d2_q_037_S26_R1_001 1d6_q_044_S30_R1_001 1e2_q_050_S34_R1_001
## 03 0 4406 4406
## 04 4406 0 0
## sample
## sampling_day 1f2_q_062_S42_R1_001 1f4_q_066_S44_R1_001 1f5_q_068_S45_R1_001
## 03 4406 0 4406
## 04 0 4406 0
## sample
## sampling_day 1f7_q_071_S47_R1_001 1f8_q_072_S48_R1_001 1g7_q_082_S55_R1_001
## 03 4406 0 4406
## 04 0 4406 0
## sample
## sampling_day 1h5_q_091_S61_R1_001 1h6_q_095_S62_R1_001 1h7_q_096_S63_R1_001
## 03 4406 0 4406
## 04 0 4406 0
## sample
## sampling_day 1i1_q_098_S65_R1_001 1i4_q_106_S68_R1_001 1i6_q_108_S70_R1_001
## 03 0 4406 0
## 04 4406 0 4406
## sample
## sampling_day 1i8_q_111_S72_R1_001 1j1_q_112_S73_R1_001 1j2_q_113_S74_R1_001
## 03 0 4406 0
## 04 4406 0 4406
## sample
## sampling_day 1j4_q_115_S76_R1_001 1j5_q_116_S77_R1_001 1j6_q_117_S78_R1_001
## 03 0 4406 4406
## 04 4406 0 0
## sample
## sampling_day 1j8_q_119_S80_R1_001 1k4_q_127_S84_R1_001 1k7_q_134_S87_R1_001
## 03 0 0 0
## 04 4406 4406 4406
## sample
## sampling_day 1l1_q_136_S89_R1_001 1l4_q_139_S92_R1_001 1l7_q_143_S95_R1_001
## 03 4406 4406 4406
## 04 0 0 0
## sample
## sampling_day 1l8_q_144_S96_R1_001 2a1_q_146_S97_R1_001 2a4_q_150_S100_R1_001
## 03 0 4406 0
## 04 4406 0 4406
## sample
## sampling_day 2a5_q_151_S101_R1_001 2a6_q_152_S102_R1_001 2b3_q_160_S107_R1_001
## 03 4406 0 4406
## 04 0 4406 0
## sample
## sampling_day 2c7_q_178_S119_R1_001 2c8_q_179_S120_R1_001 2d1_q_180_S121_R1_001
## 03 0 0 0
## 04 4406 4406 4406
## sample
## sampling_day 2d3_q_182_S123_R1_001 2d5_q_184_S125_R1_001 2e2_q_196_S130_R1_001
## 03 0 4406 4406
## 04 4406 0 0
## sample
## sampling_day 2e4_q_199_S132_R1_001 2e5_q_200_S133_R1_001 2e7_q_201_S135_R1_001
## 03 4406 0 4406
## 04 0 4406 0
## sample
## sampling_day 2e8_q_203_S136_R1_001 2f2_q_205_S138_R1_001 2f4_q_208_S140_R1_001
## 03 4406 0 0
## 04 0 4406 4406
## sample
## sampling_day 2f6_q_212_S142_R1_001 2f7_q_213_S143_R1_001 2f8_q_216_S144_R1_001
## 03 0 4406 4406
## 04 4406 0 0
## sample
## sampling_day 2g3_q_220_S147_R1_001 2g5_q_226_S149_R1_001 2g7_q_228_S151_R1_001
## 03 0 0 4406
## 04 4406 4406 0
## sample
## sampling_day 2h1_q_230_S153_R1_001 2h5_q_236_S157_R1_001 2h8_q_240_S160_R1_001
## 03 4406 4406 4406
## 04 0 0 0
## sample
## sampling_day 2i5_q_247_S165_R1_001 2i8_q_249_S168_R1_001 2j4_q_254_S172_R1_001
## 03 0 0 4406
## 04 4406 4406 0
## sample
## sampling_day 2j5_q_255_S173_R1_001 2k3_q_266_S179_R1_001 2k5_q_271_S181_R1_001
## 03 0 4406 0
## 04 4406 0 4406
## sample
## sampling_day 2k6_q_272_S182_R1_001 2k7_q_273_S183_R1_001 2k8_q_275_S184_R1_001
## 03 4406 0 4406
## 04 0 4406 0
## sample
## sampling_day 2l1_q_276_S185_R1_001 2l3_q_280_S187_R1_001 2l4_q_282_S188_R1_001
## 03 0 4406 0
## 04 4406 0 4406
## sample
## sampling_day 2l5_q_285_S189_R1_001 2l6_q_286_S190_R1_001 3a2_q_292_S194_R1_001
## 03 4406 0 4406
## 04 0 4406 0
## sample
## sampling_day 3a4_q_294_S196_R1_001 3a6_q_296_S198_R1_001 3b2_q_301_S202_R1_001
## 03 0 0 4406
## 04 4406 4406 0
## sample
## sampling_day 3b7_q_307_S207_R1_001 3b8_q_308_S208_R1_001 3c1_q_311_S209_R1_001
## 03 4406 0 4406
## 04 0 4406 0
## sample
## sampling_day 3c3_q_314_S211_R1_001 3c4_q_315_S212_R1_001 3c5_q_317_S213_R1_001
## 03 4406 0 4406
## 04 0 4406 0
## sample
## sampling_day 3c6_q_318_S214_R1_001 3d5_q_330_S221_R1_001 3d7_q_334_S223_R1_001
## 03 4406 4406 0
## 04 0 0 4406
## sample
## sampling_day 3d8_q_336_S224_R1_001 3e2_q_342_S226_R1_001 3e3_q_343_S227_R1_001
## 03 0 0 4406
## 04 4406 4406 0
## sample
## sampling_day 3e4_q_344_S228_R1_001 3e6_q_350_S230_R1_001 3f1_q_353_S233_R1_001
## 03 0 0 4406
## 04 4406 4406 0
## sample
## sampling_day 3f3_q_354_S235_R1_001 3g2_q_364_S242_R1_001 3g4_q_366_S244_R1_001
## 03 0 0 4406
## 04 4406 4406 0
## sample
## sampling_day 3g5_q_367_S245_R1_001 3h7_q_384_S255_R1_001 3i1_q_388_S257_R1_001
## 03 0 0 0
## 04 4406 4406 4406
## sample
## sampling_day 3i2_q_389_S258_R1_001 3i3_q_391_S259_R1_001 3i6_q_395_S262_R1_001
## 03 4406 4406 4406
## 04 0 0 0
## sample
## sampling_day 3i7_q_396_S263_R1_001 3j3_q_401_S267_R1_001 3j4_q_403_S268_R1_001
## 03 4406 0 0
## 04 0 4406 4406
## sample
## sampling_day 3j6_q_407_S270_R1_001 3j7_q_409_S271_R1_001 3k1_q_411_S273_R1_001
## 03 4406 4406 0
## 04 0 0 4406
## sample
## sampling_day 3k5_q_415_S277_R1_001 3l2_q_423_S282_R1_001 3l3_q_424_S283_R1_001
## 03 0 4406 4406
## 04 4406 0 0
## sample
## sampling_day 3l4_q_426_S284_R1_001 3l5_q_427_S285_R1_001 3l7_q_429_S287_R1_001
## 03 4406 0 0
## 04 0 4406 4406
# spread back to a genes x samples matrix for kmeans.
# NOTE(review): spread()'s third positional argument is `fill`, so `-1` fills
# missing combinations with -1 rather than dropping column 1 -- confirm intent.
cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG.spread<-cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG %>% dplyr::select(transcript_ID,sample,value) %>% spread(sample,value,-1)
dim(cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG.spread) # [1] 4406 121
## [1] 4406 121
# within-groups sum of squares for K = 1..30 (elbow plot)
wss <- (nrow(cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG.spread[,-1])-1)*sum(apply(cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG.spread[,-1],2,var))
for (i in 2:30) wss[i] <- sum(kmeans(cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG.spread[,-1],
centers=i,iter.max = 20)$withinss) # iter.max raised above the default 10, which gave a "did not converge in 10 iterations" error. Solution: https://r.789695.n4.nabble.com/kmeans-quot-did-not-converge-in-10-iterations-quot-td797019.html.
plot(1:30, wss, type="b", xlab="Number of Clusters", ylab="Within groups sum of squares")
Let’s perform the actual clustering using K=6:
# K-means with K = 6 on the scaled DEG matrix (rows = genes, columns = samples).
# Seed fixed for reproducibility; nstart = 1000 random starts keeps the best solution.
set.seed(20)
# iter.max raised from the default 10 to avoid non-convergence warnings
kClust.diurnal34.any.trt.6 <- kmeans(cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG.spread[,-1], centers=6, nstart = 1000, iter.max = 20)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 220300)
# Per-gene cluster assignments (named integer vector, one entry per gene).
kClusters.diurnal34.any.trt.6 <- kClust.diurnal34.any.trt.6$cluster
# number of genes per cluster; cluster id converted to character so it can be
# joined against the (character) cluster column produced by gather() later on
cluster.diurnal34.any.trt.6.num <- tibble(cluster = kClusters.diurnal34.any.trt.6) %>%
  dplyr::count(cluster) %>%
  dplyr::mutate(cluster = as.character(cluster))
cluster.diurnal34.any.trt.6.num
Now we can calculate the cluster ‘cores’ (centroids), applying the clust.centroid helper function to each cluster:
# Cluster 'cores' (centroids): apply the project-level clust.centroid helper to each
# cluster id; result is a samples x clusters matrix (see printed head() below).
# NOTE(review): clust.centroid is defined elsewhere in this file — presumably it
# averages the scaled expression of the genes assigned to cluster i; confirm there.
kClustcentroids.diurnal34.any.trt.6 <- sapply(levels(factor(kClusters.diurnal34.any.trt.6)), clust.centroid, cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG.spread[,-1], kClusters.diurnal34.any.trt.6)
kClustcentroids.diurnal34.any.trt.6 %>% head()
## 1 2 3 4
## 1a1_q_002_S1_R1_001 0.15394279 0.574503587 -0.1346771004 -0.4374420358
## 1a2_q_003_S2_R1_001 -0.18939602 0.875668492 0.0141788125 0.0003232468
## 1a4_q_005_S4_R1_001 0.04440172 0.544444068 -0.1535801774 -0.5019607167
## 1a6_q_007_S6_R1_001 -0.48949297 -0.008991221 0.0009558034 0.6395503254
## 1b2_q_013_S10_R1_001 -0.51370767 -0.243401224 0.0986233744 1.3543390190
## 1b4_q_015_S12_R1_001 -0.62238828 -0.316876093 0.0895423127 1.1728175746
## 5 6
## 1a1_q_002_S1_R1_001 -0.6981031 -0.1141115
## 1a2_q_003_S2_R1_001 -0.7348265 0.1399330
## 1a4_q_005_S4_R1_001 -0.7957776 -0.2638425
## 1a6_q_007_S6_R1_001 0.3361567 0.6853755
## 1b2_q_013_S10_R1_001 0.3719169 0.8161256
## 1b4_q_015_S12_R1_001 0.4386614 0.6863338
# adding sample description to data
# Long-format centroid table: one row per (sample, cluster), joined to the sample
# metadata and the per-cluster gene counts. cluster.n is the facet label
# "<cluster>\n(<n genes>)". Joining coerces the factor `sample` column to
# character — that is the warning seen in the knitted output below.
data.sample<-kClustcentroids.diurnal34.any.trt.6 %>% as_tibble(rownames="sample") %>%
gather(cluster,value,-1) %>%
inner_join(sample.description.timecourse,by="sample") %>%
inner_join(cluster.diurnal34.any.trt.6.num,by="cluster") %>%
mutate(cluster.n=glue::glue('{cluster2} \n({n2})',
cluster2=cluster,
n2=n) )
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# data.group for adding group cluster mean: one row per
# sampling_time x soil_trt x cluster with the mean centroid value, carrying the
# plotting metadata columns along via a self-join on the composite key.
data.group<-data.sample %>%
  unite("sampling_time.soil.cluster", c("sampling_time","soil_trt","cluster"),remove=FALSE) %>%
  group_by(sampling_time.soil.cluster) %>%
  summarize(sampling_time.soil.cluster.mean=mean(value)) %>%
  inner_join(
    data.sample %>%
      unite("sampling_time.soil.cluster", c("sampling_time","soil_trt","cluster"),remove=FALSE) %>%
      dplyr::select("sampling_time.soil.cluster","sampling_time","sampling_day","soil_trt","cluster.n","cluster"),
    by="sampling_time.soil.cluster") %>%
  # keep the first row per key — equivalent to the original
  # dplyr::slice(rep(1:1800)[!duplicated(...)]) but without hard-coding the
  # row count (1800), which silently broke if the data size changed.
  # NOTE(review): sampling_day varies within a key, so only the first row's
  # value survives (same as the original behavior) — confirm that is intended.
  dplyr::filter(!duplicated(sampling_time.soil.cluster))
# plot: per-sample centroid values (jittered points) with per-group mean lines,
# faceted by cluster (rows, labelled with gene counts) and sampling time (columns)
p6.diurnal34.any.trt <- ggplot(
  data.sample,
  aes(x = soil_trt, y = value, group = cluster, colour = as.factor(cluster), shape = sampling_day)
) +
  geom_jitter(alpha = 0.5, width = 0.25) +
  geom_hline(yintercept = 0, color = "red") +
  geom_line(data = data.group, aes(x = soil_trt, y = sampling_time.soil.cluster.mean)) +
  facet_grid(cluster.n ~ sampling_time, scales = "free") +
  theme(
    axis.text = element_text(angle = 90),
    strip.text.y = element_text(angle = 0),
    legend.position = "none"
  ) +
  labs(
    title = "K-means clustering of diurnal DEGs (day 3 and 4): six clusters",
    color = "Cluster",
    y = "scaled expression level"
  )
p6.diurnal34.any.trt
ggsave(p6.diurnal34.any.trt, file = "../output/diurnal34.any.trt.DEG.Kmean.6clusters.png", width = 11, height = 6)
Let’s perform the actual clustering using K=15:
# Repeat the clustering with K = 15 (same seed, nstart, and iter.max as the K = 6 run).
set.seed(20)
kClust.diurnal34.any.trt.15 <- kmeans(cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG.spread[,-1], centers=15, nstart = 1000, iter.max = 20)
# Per-gene cluster assignments for the K = 15 solution.
kClusters.diurnal34.any.trt.15 <- kClust.diurnal34.any.trt.15$cluster
# number of genes per cluster; character ids so the later join on "cluster" works
cluster.diurnal34.any.trt.15.num <- tibble(cluster = kClusters.diurnal34.any.trt.15) %>%
  dplyr::count(cluster) %>%
  dplyr::mutate(cluster = as.character(cluster))
cluster.diurnal34.any.trt.15.num
Now we can calculate the cluster ‘cores’ (centroids), applying the clust.centroid helper function to each cluster:
# Cluster centroids for K = 15 via the project-level clust.centroid helper;
# result is a samples x 15 matrix (printed head() below).
kClustcentroids.diurnal34.any.trt.15 <- sapply(levels(factor(kClusters.diurnal34.any.trt.15)), clust.centroid, cpm.timecourse.v3.0.scale.diurnal34.any.trt.DEG.spread[,-1], kClusters.diurnal34.any.trt.15)
kClustcentroids.diurnal34.any.trt.15 %>% head()
## 1 2 3 4
## 1a1_q_002_S1_R1_001 -0.64961818 -0.02479961 -0.289615400 -0.9786539
## 1a2_q_003_S2_R1_001 -0.75566949 0.32130679 0.381780630 -0.4732351
## 1a4_q_005_S4_R1_001 -0.83250586 -0.12224420 -0.344503252 -0.7526523
## 1a6_q_007_S6_R1_001 -0.06230493 0.65223688 -0.040911215 1.1752934
## 1b2_q_013_S10_R1_001 0.16136731 0.86825638 0.010317928 0.7885374
## 1b4_q_015_S12_R1_001 0.12967927 0.76800694 -0.003837876 1.2768045
## 5 6 7 8 9
## 1a1_q_002_S1_R1_001 -0.38370094 -0.1313879 -0.4915427 -0.3730555 -0.1059656
## 1a2_q_003_S2_R1_001 0.06193626 -0.5236392 -0.2458315 -0.7402790 0.4217254
## 1a4_q_005_S4_R1_001 -0.24566920 -0.1293080 -0.4491374 -0.8180449 -0.4273409
## 1a6_q_007_S6_R1_001 0.27824225 -0.4460591 0.1487576 0.7037006 0.3453350
## 1b2_q_013_S10_R1_001 0.68953710 -0.6315544 0.3617727 0.5376185 1.2756411
## 1b4_q_015_S12_R1_001 0.59780985 -0.7326287 0.1385924 0.4173258 1.2709022
## 10 11 12 13
## 1a1_q_002_S1_R1_001 -0.4651650 0.74986457 0.33568695 0.5109533
## 1a2_q_003_S2_R1_001 -0.2048314 0.03310414 -0.28147008 1.8647825
## 1a4_q_005_S4_R1_001 -0.5358867 -0.03666176 0.09234978 1.0800513
## 1a6_q_007_S6_R1_001 0.7814266 -0.44958670 0.57106597 -0.2548949
## 1b2_q_013_S10_R1_001 1.7723968 -0.11230894 -0.46208247 -0.1866692
## 1b4_q_015_S12_R1_001 1.3563270 -0.24063026 -0.56602888 -0.2174072
## 14 15
## 1a1_q_002_S1_R1_001 -0.0798020913 0.1730526
## 1a2_q_003_S2_R1_001 -0.0626269210 0.4743356
## 1a4_q_005_S4_R1_001 -0.1411833436 0.4841148
## 1a6_q_007_S6_R1_001 -0.0003048739 -0.5710707
## 1b2_q_013_S10_R1_001 0.0193753243 -0.5400588
## 1b4_q_015_S12_R1_001 0.0260617892 -0.6416530
# adding sample description to data
# Long-format centroid table for K = 15: one row per (sample, cluster), joined to
# sample metadata and per-cluster gene counts; cluster.n is the facet label.
# The join coerces factor `sample` to character (warning below).
data.sample<-kClustcentroids.diurnal34.any.trt.15 %>% as_tibble(rownames="sample") %>%
gather(cluster,value,-1) %>%
inner_join(sample.description.timecourse,by="sample") %>%
inner_join(cluster.diurnal34.any.trt.15.num,by="cluster") %>%
mutate(cluster.n=glue::glue('{cluster2} \n({n2})',
cluster2=cluster,
n2=n) )
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# data.group for adding group cluster mean: mean centroid value per
# sampling_time x soil_trt x cluster, plus plotting metadata via a self-join.
data.group<-data.sample %>%
  unite("sampling_time.soil.cluster", c("sampling_time","soil_trt","cluster"),remove=FALSE) %>%
  group_by(sampling_time.soil.cluster) %>%
  summarize(sampling_time.soil.cluster.mean=mean(value)) %>%
  inner_join(
    data.sample %>%
      unite("sampling_time.soil.cluster", c("sampling_time","soil_trt","cluster"),remove=FALSE) %>%
      dplyr::select("sampling_time.soil.cluster","sampling_time","sampling_day","soil_trt","cluster.n","cluster"),
    by="sampling_time.soil.cluster") %>%
  # keep the first row per key — replaces the original fragile
  # dplyr::slice(rep(1:1800)[!duplicated(...)]) that hard-coded the row count.
  dplyr::filter(!duplicated(sampling_time.soil.cluster))
# plot: jittered per-sample centroid values with per-group mean lines,
# faceted by cluster (rows) and sampling time (columns); 15 clusters
p15.diurnal34.any.trt <- ggplot(
  data.sample,
  aes(x = soil_trt, y = value, group = cluster, colour = as.factor(cluster), shape = sampling_day)
) +
  geom_jitter(alpha = 0.2) +
  geom_hline(yintercept = 0, color = "red") +
  geom_line(data = data.group, aes(x = soil_trt, y = sampling_time.soil.cluster.mean)) +
  facet_grid(cluster.n ~ sampling_time, scales = "free") +
  theme(
    axis.text = element_text(angle = 90),
    strip.text.y = element_text(angle = 0),
    legend.position = "none"
  ) +
  labs(
    title = "K-means clustering of diurnal DEGs (day 3 and 4): fifteen clusters",
    color = "Cluster",
    y = "scaled expression level"
  )
p15.diurnal34.any.trt
ggsave(p15.diurnal34.any.trt, file = "../output/diurnal34.any.trt.DEG.Kmean.15clusters.png", width = 11, height = 15)
dge.diurnal1314.any.trtlive.DEGs.all.v3.0anno %>% head()
# scaling expression data (scaled cpm, computed earlier in this file)
cpm.timecourse.v3.0.scale %>% head()
# diurnal day 13 and 14 DEG expression data (scaled): keep only genes that are
# significant diurnal DEGs (FDR < 0.05), reshape to long format, attach sample
# metadata, and restrict to samples from sampling days 13 and 14.
# (The original comment said "3and4 days" — a copy-paste from the day 3/4 section.)
cpm.timecourse.v3.0.scale.diurnal1314.any.trt.DEG<-cpm.timecourse.v3.0.scale %>%
inner_join(dge.diurnal1314.any.trtlive.DEGs.all.v3.0anno %>% filter(FDR<0.05) %>% dplyr::select(genes),by=c(transcript_ID="genes")) %>%
gather(sample,value,-1) %>% inner_join(sample.description.timecourse,by="sample") %>% filter(sampling_day %in% c("13","14")) # long format; the stale "[1] 11886 121" note removed — spread below yields 415 x 121
## Warning: Column `transcript_ID`/`genes` joining factor and character vector,
## coercing into character vector
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# Sanity check: cross-tab of sampling_day x sample — each retained sample should
# contribute all 415 DEGs on exactly one of days 13/14 (see output below).
with(cpm.timecourse.v3.0.scale.diurnal1314.any.trt.DEG,table(sampling_day,sample)) # OK
## sample
## sampling_day 1a5_q_006_S5_R1_001 1b1_q_012_S9_R1_001 1b3_q_014_S11_R1_001
## 13 415 0 415
## 14 0 415 0
## sample
## sampling_day 1b6_q_017_S14_R1_001 1b7_q_020_S15_R1_001 1c2_q_024_S18_R1_001
## 13 0 415 0
## 14 415 0 415
## sample
## sampling_day 1c3_q_025_S19_R1_001 1c4_q_026_S20_R1_001 1c6_q_028_S22_R1_001
## 13 0 0 0
## 14 415 415 415
## sample
## sampling_day 1d1_q_035_S25_R1_001 1d4_q_040_S28_R1_001 1d5_q_042_S29_R1_001
## 13 0 415 415
## 14 415 0 0
## sample
## sampling_day 1d7_q_045_S31_R1_001 1d8_q_046_S32_R1_001 1e1_q_048_S33_R1_001
## 13 0 415 415
## 14 415 0 0
## sample
## sampling_day 1e3_q_053_S35_R1_001 1e4_q_055_S36_R1_001 1e7_q_058_S39_R1_001
## 13 415 0 0
## 14 0 415 415
## sample
## sampling_day 1f1_q_060_S41_R1_001 1f6_q_070_S46_R1_001 1g1_q_073_S49_R1_001
## 13 415 415 0
## 14 0 0 415
## sample
## sampling_day 1g2_q_074_S50_R1_001 1g4_q_077_S52_R1_001 1g5_q_080_S53_R1_001
## 13 415 415 415
## 14 0 0 0
## sample
## sampling_day 1g6_q_081_S54_R1_001 1g8_q_083_S56_R1_001 1h1_q_084_S57_R1_001
## 13 0 0 0
## 14 415 415 415
## sample
## sampling_day 1h2_q_085_S58_R1_001 1h8_q_097_S64_R1_001 1i3_q_105_S67_R1_001
## 13 415 0 0
## 14 0 415 415
## sample
## sampling_day 1i5_q_107_S69_R1_001 1i7_q_110_S71_R1_001 1j3_q_114_S75_R1_001
## 13 0 0 415
## 14 415 415 0
## sample
## sampling_day 1k1_q_120_S81_R1_001 1k3_q_123_S83_R1_001 1k5_q_128_S85_R1_001
## 13 415 415 415
## 14 0 0 0
## sample
## sampling_day 1l2_q_137_S90_R1_001 1l3_q_138_S91_R1_001 1l5_q_141_S93_R1_001
## 13 415 415 0
## 14 0 0 415
## sample
## sampling_day 1l6_q_142_S94_R1_001 2a2_q_147_S98_R1_001 2a3_q_148_S99_R1_001
## 13 0 415 415
## 14 415 0 0
## sample
## sampling_day 2a8_q_154_S104_R1_001 2b1_q_156_S105_R1_001 2b2_q_158_S106_R1_001
## 13 415 415 0
## 14 0 0 415
## sample
## sampling_day 2b4_q_161_S108_R1_001 2b5_q_162_S109_R1_001 2b6_q_164_S110_R1_001
## 13 0 415 0
## 14 415 0 415
## sample
## sampling_day 2c1_q_168_S113_R1_001 2c2_q_169_S114_R1_001 2c5_q_173_S117_R1_001
## 13 415 415 0
## 14 0 0 415
## sample
## sampling_day 2c6_q_175_S118_R1_001 2d4_q_183_S124_R1_001 2d6_q_185_S126_R1_001
## 13 0 0 0
## 14 415 415 415
## sample
## sampling_day 2d8_q_190_S128_R1_001 2e1_q_193_S129_R1_001 2e3_q_198_S131_R1_001
## 13 415 0 415
## 14 0 415 0
## sample
## sampling_day 2f1_q_204_S137_R1_001 2f3_q_206_S139_R1_001 2f5_q_211_S141_R1_001
## 13 0 415 0
## 14 415 0 415
## sample
## sampling_day 2g2_q_219_S146_R1_001 2g6_q_227_S150_R1_001 2g8_q_229_S152_R1_001
## 13 415 0 415
## 14 0 415 0
## sample
## sampling_day 2h2_q_231_S154_R1_001 2h3_q_232_S155_R1_001 2h6_q_237_S158_R1_001
## 13 0 415 0
## 14 415 0 415
## sample
## sampling_day 2h7_q_238_S159_R1_001 2i2_q_243_S162_R1_001 2i3_q_245_S163_R1_001
## 13 0 415 415
## 14 415 0 0
## sample
## sampling_day 2i4_q_246_S164_R1_001 2i7_q_248_S167_R1_001 2j2_q_252_S170_R1_001
## 13 415 0 415
## 14 0 415 0
## sample
## sampling_day 2j3_q_253_S171_R1_001 2j8_q_261_S176_R1_001 2k1_q_263_S177_R1_001
## 13 0 415 415
## 14 415 0 0
## sample
## sampling_day 2k2_q_265_S178_R1_001 2k4_q_267_S180_R1_001 2l2_q_278_S186_R1_001
## 13 0 0 415
## 14 415 415 0
## sample
## sampling_day 2l7_q_287_S191_R1_001 2l8_q_288_S192_R1_001 3a3_q_293_S195_R1_001
## 13 0 0 415
## 14 415 415 0
## sample
## sampling_day 3a5_q_295_S197_R1_001 3a7_q_297_S199_R1_001 3a8_q_299_S200_R1_001
## 13 415 415 0
## 14 0 0 415
## sample
## sampling_day 3b1_q_300_S201_R1_001 3b3_q_302_S203_R1_001 3b4_q_303_S204_R1_001
## 13 0 0 0
## 14 415 415 415
## sample
## sampling_day 3b6_q_306_S206_R1_001 3c2_q_312_S210_R1_001 3c8_q_323_S216_R1_001
## 13 415 415 0
## 14 0 0 415
## sample
## sampling_day 3d1_q_324_S217_R1_001 3d2_q_325_S218_R1_001 3d3_q_326_S219_R1_001
## 13 0 415 415
## 14 415 0 0
## sample
## sampling_day 3d4_q_329_S220_R1_001 3e1_q_339_S225_R1_001 3e5_q_348_S229_R1_001
## 13 415 415 0
## 14 0 0 415
## sample
## sampling_day 3e7_q_351_S231_R1_001 3e8_q_352_S232_R1_001 3f5_q_358_S237_R1_001
## 13 0 415 0
## 14 415 0 415
## sample
## sampling_day 3f6_q_359_S238_R1_001 3g1_q_362_S241_R1_001 3g3_q_365_S243_R1_001
## 13 0 0 0
## 14 415 415 415
## sample
## sampling_day 3g6_q_369_S246_R1_001 3g7_q_370_S247_R1_001 3g8_q_371_S248_R1_001
## 13 415 0 415
## 14 0 415 0
## sample
## sampling_day 3h1_q_372_S249_R1_001 3h4_q_376_S252_R1_001 3h6_q_378_S254_R1_001
## 13 415 0 415
## 14 0 415 0
## sample
## sampling_day 3i5_q_393_S261_R1_001 3i8_q_397_S264_R1_001 3j1_q_398_S265_R1_001
## 13 415 415 0
## 14 0 0 415
## sample
## sampling_day 3j2_q_399_S266_R1_001 3j5_q_405_S269_R1_001 3j8_q_410_S272_R1_001
## 13 0 0 415
## 14 415 415 0
## sample
## sampling_day 3k2_q_412_S274_R1_001 3k3_q_413_S275_R1_001 3k4_q_414_S276_R1_001
## 13 415 415 0
## 14 0 0 415
## sample
## sampling_day 3k8_q_420_S280_R1_001 3l6_q_428_S286_R1_001 3l8_q_432_S288_R1_001
## 13 0 415 0
## 14 415 0 415
# spread: one row per transcript, one column per sample (wide matrix for kmeans)
cpm.timecourse.v3.0.scale.diurnal1314.any.trt.DEG.spread<-cpm.timecourse.v3.0.scale.diurnal1314.any.trt.DEG %>%
  dplyr::select(transcript_ID,sample,value) %>%
  # NOTE(review): the original passed a stray third positional arg (-1), which
  # spread() interprets as fill = -1. Dropped so missing combinations stay NA.
  spread(sample,value)
dim(cpm.timecourse.v3.0.scale.diurnal1314.any.trt.DEG.spread) # [1] 415 121 — the original "# [1] 11886 121" comment was stale (see output below)
## [1] 415 121
# calculate wss (within-groups sum of squares) for k = 1..30 — elbow method for picking K
expr.mat1314 <- cpm.timecourse.v3.0.scale.diurnal1314.any.trt.DEG.spread[,-1] # hoist: drop transcript_ID once
wss <- numeric(30) # preallocate instead of growing the vector inside the loop
# k = 1: total SS around the overall mean = (n - 1) * sum of per-column variances
wss[1] <- (nrow(expr.mat1314)-1)*sum(apply(expr.mat1314,2,var))
for (i in 2:30) {
  # iter.max = 20 because the default 10 gave a "did not converge in 10 iterations" error.
  # Solution: https://r.789695.n4.nabble.com/kmeans-quot-did-not-converge-in-10-iterations-quot-td797019.html.
  wss[i] <- sum(kmeans(expr.mat1314, centers=i, iter.max = 20)$withinss)
}
plot(1:30, wss, type="b", xlab="Number of Clusters", ylab="Within groups sum of squares")
Let’s perform the actual clustering using K=6:
# K-means with K = 6 on the day 13/14 scaled DEG matrix; seed fixed, 1000 random starts.
set.seed(20)
kClust.diurnal1314.any.trt.6 <- kmeans(cpm.timecourse.v3.0.scale.diurnal1314.any.trt.DEG.spread[,-1], centers=6, nstart = 1000, iter.max = 20)
# Per-gene cluster assignments (K = 6, day 13/14).
kClusters.diurnal1314.any.trt.6 <- kClust.diurnal1314.any.trt.6$cluster
# number of genes per cluster; character ids so the later join on "cluster" works
cluster.diurnal1314.any.trt.6.num <- tibble(cluster = kClusters.diurnal1314.any.trt.6) %>%
  dplyr::count(cluster) %>%
  dplyr::mutate(cluster = as.character(cluster))
cluster.diurnal1314.any.trt.6.num
Now we can calculate the cluster ‘cores’ (centroids), applying the clust.centroid helper function to each cluster:
# Cluster centroids via the project-level clust.centroid helper;
# result is a samples x 6 matrix (printed head() below).
kClustcentroids.diurnal1314.any.trt.6 <- sapply(levels(factor(kClusters.diurnal1314.any.trt.6)), clust.centroid, cpm.timecourse.v3.0.scale.diurnal1314.any.trt.DEG.spread[,-1], kClusters.diurnal1314.any.trt.6)
kClustcentroids.diurnal1314.any.trt.6 %>% head()
## 1 2 3 4 5
## 1a5_q_006_S5_R1_001 -0.27278698 0.3226640 0.02428788 -0.22817029 -0.09108589
## 1b1_q_012_S9_R1_001 -0.09515938 0.3911190 -0.06995477 0.69649873 -0.56483373
## 1b3_q_014_S11_R1_001 -0.02012877 0.8087725 -0.09366928 0.40835854 -0.02250425
## 1b6_q_017_S14_R1_001 -0.06340790 1.0743089 -0.07322537 -0.04130941 -0.14094087
## 1b7_q_020_S15_R1_001 0.08215561 -0.2202796 -0.09172485 0.94910468 -0.48629465
## 1c2_q_024_S18_R1_001 -0.18829890 0.2982033 -0.13371672 0.29951480 -0.47074934
## 6
## 1a5_q_006_S5_R1_001 0.75590838
## 1b1_q_012_S9_R1_001 -0.28492633
## 1b3_q_014_S11_R1_001 0.50962342
## 1b6_q_017_S14_R1_001 0.58617165
## 1b7_q_020_S15_R1_001 -0.20442966
## 1c2_q_024_S18_R1_001 -0.07883211
# adding sample description to data
# Long-format centroid table: one row per (sample, cluster), joined to sample
# metadata and per-cluster gene counts; cluster.n is the "<cluster>\n(<n>)"
# facet label. The join coerces factor `sample` to character (warning below).
data.sample<-kClustcentroids.diurnal1314.any.trt.6 %>% as_tibble(rownames="sample") %>%
gather(cluster,value,-1) %>%
inner_join(sample.description.timecourse,by="sample") %>%
inner_join(cluster.diurnal1314.any.trt.6.num,by="cluster") %>%
mutate(cluster.n=glue::glue('{cluster2} \n({n2})',
cluster2=cluster,
n2=n) )
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# data.group for adding group cluster mean: mean centroid value per
# sampling_time x soil_trt x cluster, plus plotting metadata via a self-join.
data.group<-data.sample %>%
  unite("sampling_time.soil.cluster", c("sampling_time","soil_trt","cluster"),remove=FALSE) %>%
  group_by(sampling_time.soil.cluster) %>%
  summarize(sampling_time.soil.cluster.mean=mean(value)) %>%
  inner_join(
    data.sample %>%
      unite("sampling_time.soil.cluster", c("sampling_time","soil_trt","cluster"),remove=FALSE) %>%
      dplyr::select("sampling_time.soil.cluster","sampling_time","sampling_day","soil_trt","cluster.n","cluster"),
    by="sampling_time.soil.cluster") %>%
  # keep the first row per key — replaces the original fragile
  # dplyr::slice(rep(1:1800)[!duplicated(...)]) that hard-coded the row count.
  dplyr::filter(!duplicated(sampling_time.soil.cluster))
# plot: jittered per-sample centroid values with per-group mean lines,
# faceted by cluster (rows) and sampling time (columns); day 13/14, six clusters
p6.diurnal1314.any.trt <- ggplot(
  data.sample,
  aes(x = soil_trt, y = value, group = cluster, colour = as.factor(cluster), shape = sampling_day)
) +
  geom_jitter(alpha = 0.5, width = 0.25) +
  geom_hline(yintercept = 0, color = "red") +
  geom_line(data = data.group, aes(x = soil_trt, y = sampling_time.soil.cluster.mean)) +
  facet_grid(cluster.n ~ sampling_time, scales = "free") +
  theme(
    axis.text = element_text(angle = 90),
    strip.text.y = element_text(angle = 0),
    legend.position = "none"
  ) +
  labs(
    title = "K-means clustering of diurnal DEGs (day 13 and 14): six clusters",
    color = "Cluster",
    y = "scaled expression level"
  )
p6.diurnal1314.any.trt
ggsave(p6.diurnal1314.any.trt, file = "../output/diurnal1314.any.trt.DEG.Kmean.6clusters.png", width = 11, height = 6)
Let’s perform the actual clustering using K=15:
# Repeat the day 13/14 clustering with K = 15 (same seed and settings as K = 6).
set.seed(20)
kClust.diurnal1314.any.trt.15 <- kmeans(cpm.timecourse.v3.0.scale.diurnal1314.any.trt.DEG.spread[,-1], centers=15, nstart = 1000, iter.max = 20)
# Per-gene cluster assignments (K = 15, day 13/14).
kClusters.diurnal1314.any.trt.15 <- kClust.diurnal1314.any.trt.15$cluster
# number of genes per cluster; character ids so the later join on "cluster" works
cluster.diurnal1314.any.trt.15.num <- tibble(cluster = kClusters.diurnal1314.any.trt.15) %>%
  dplyr::count(cluster) %>%
  dplyr::mutate(cluster = as.character(cluster))
cluster.diurnal1314.any.trt.15.num
Now we can calculate the cluster ‘cores’ (centroids), applying the clust.centroid helper function to each cluster:
# Cluster centroids via the project-level clust.centroid helper;
# result is a samples x 15 matrix (printed head() below).
kClustcentroids.diurnal1314.any.trt.15 <- sapply(levels(factor(kClusters.diurnal1314.any.trt.15)), clust.centroid, cpm.timecourse.v3.0.scale.diurnal1314.any.trt.DEG.spread[,-1], kClusters.diurnal1314.any.trt.15)
kClustcentroids.diurnal1314.any.trt.15 %>% head()
## 1 2 3 4 5
## 1a5_q_006_S5_R1_001 0.04742393 -0.09217499 0.06968819 -0.01939530 0.37585514
## 1b1_q_012_S9_R1_001 1.03158347 -0.33091601 0.06125328 -0.10626701 0.06704349
## 1b3_q_014_S11_R1_001 0.35433674 0.19127711 0.59181089 -0.07592017 -0.06355553
## 1b6_q_017_S14_R1_001 0.09880705 0.86363272 0.31282606 -0.04667052 0.57680721
## 1b7_q_020_S15_R1_001 1.32429031 -0.70446229 -0.41644143 -0.11020982 -0.50597283
## 1c2_q_024_S18_R1_001 0.35385751 -0.50915924 0.10693178 -0.12079579 0.19183548
## 6 7 8 9 10
## 1a5_q_006_S5_R1_001 -0.2946402 -0.2663661 -0.44336291 0.8137632 0.1135254
## 1b1_q_012_S9_R1_001 -0.6416759 0.5529310 -0.22453618 0.5023845 -0.5078942
## 1b3_q_014_S11_R1_001 -0.2651513 0.4863177 -0.04005012 0.5536828 0.3295724
## 1b6_q_017_S14_R1_001 -0.2105430 -0.4760724 -0.15349776 1.8412404 -0.1851687
## 1b7_q_020_S15_R1_001 -0.5645009 0.4739545 -0.12256037 -0.1077408 -0.3537980
## 1c2_q_024_S18_R1_001 -0.4589185 0.1999366 -0.11381218 0.3331559 -0.5232549
## 11 12 13 14 15
## 1a5_q_006_S5_R1_001 0.6889138 0.9788134 -0.45833208 0.9661078 -0.38559244
## 1b1_q_012_S9_R1_001 0.6638472 -0.4843574 -0.13231957 0.2522155 0.90530279
## 1b3_q_014_S11_R1_001 -0.7114466 0.5190911 0.10275010 1.2747567 0.44640559
## 1b6_q_017_S14_R1_001 0.3208429 0.1038389 0.03302613 1.3564667 -0.02622145
## 1b7_q_020_S15_R1_001 0.7455260 -0.2624090 -0.08730064 0.3794383 1.34149861
## 1c2_q_024_S18_R1_001 -0.6527750 -0.1144491 0.10468412 0.0556180 0.33083885
# adding sample description to data
# Long-format centroid table for K = 15: one row per (sample, cluster), joined to
# sample metadata and per-cluster gene counts; cluster.n is the facet label.
# The join coerces factor `sample` to character (warning below).
data.sample<-kClustcentroids.diurnal1314.any.trt.15 %>% as_tibble(rownames="sample") %>%
gather(cluster,value,-1) %>%
inner_join(sample.description.timecourse,by="sample") %>%
inner_join(cluster.diurnal1314.any.trt.15.num,by="cluster") %>%
mutate(cluster.n=glue::glue('{cluster2} \n({n2})',
cluster2=cluster,
n2=n) )
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# data.group for adding group cluster mean: mean centroid value per
# sampling_time x soil_trt x cluster, plus plotting metadata via a self-join.
data.group<-data.sample %>%
  unite("sampling_time.soil.cluster", c("sampling_time","soil_trt","cluster"),remove=FALSE) %>%
  group_by(sampling_time.soil.cluster) %>%
  summarize(sampling_time.soil.cluster.mean=mean(value)) %>%
  inner_join(
    data.sample %>%
      unite("sampling_time.soil.cluster", c("sampling_time","soil_trt","cluster"),remove=FALSE) %>%
      dplyr::select("sampling_time.soil.cluster","sampling_time","sampling_day","soil_trt","cluster.n","cluster"),
    by="sampling_time.soil.cluster") %>%
  # keep the first row per key — replaces the original fragile
  # dplyr::slice(rep(1:1800)[!duplicated(...)]) that hard-coded the row count.
  dplyr::filter(!duplicated(sampling_time.soil.cluster))
# plot: jittered per-sample centroid values with per-group mean lines,
# faceted by cluster (rows) and sampling time (columns); day 13/14, 15 clusters
p15.diurnal1314.any.trt <- ggplot(
  data.sample,
  aes(x = soil_trt, y = value, group = cluster, colour = as.factor(cluster), shape = sampling_day)
) +
  geom_jitter(alpha = 0.2) +
  geom_hline(yintercept = 0, color = "red") +
  geom_line(data = data.group, aes(x = soil_trt, y = sampling_time.soil.cluster.mean)) +
  facet_grid(cluster.n ~ sampling_time, scales = "free") +
  theme(
    axis.text = element_text(angle = 90),
    strip.text.y = element_text(angle = 0),
    legend.position = "none"
  ) +
  labs(
    title = "K-means clustering of diurnal DEGs (day 13 and 14): fifteen clusters",
    color = "Cluster",
    y = "scaled expression level"
  )
p15.diurnal1314.any.trt
ggsave(p15.diurnal1314.any.trt, file = "../output/diurnal1314.any.trt.DEG.Kmean.15clusters.png", width = 11, height = 15)
# Record the R and package versions used for this analysis (output below).
sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] annotate_1.64.0 XML_3.99-0.3
## [3] GO.db_3.10.0 AnnotationDbi_1.48.0
## [5] goseq_1.38.0 geneLenDataBase_1.22.0
## [7] BiasedUrn_1.07 ShortRead_1.44.3
## [9] GenomicAlignments_1.22.1 SummarizedExperiment_1.16.1
## [11] DelayedArray_0.12.2 matrixStats_0.55.0
## [13] Biobase_2.46.0 Rsamtools_2.2.3
## [15] GenomicRanges_1.38.0 GenomeInfoDb_1.22.0
## [17] Biostrings_2.54.0 XVector_0.26.0
## [19] IRanges_2.20.2 S4Vectors_0.24.3
## [21] BiocParallel_1.20.1 BiocGenerics_0.32.0
## [23] cowplot_1.0.0 glue_1.3.1
## [25] readxl_1.3.1 forcats_0.5.0
## [27] stringr_1.4.0 dplyr_0.8.4
## [29] purrr_0.3.3 readr_1.3.1
## [31] tidyr_1.0.2 tibble_2.1.3
## [33] ggplot2_3.2.1 tidyverse_1.3.0
## [35] edgeR_3.28.1 limma_3.42.2
## [37] knitr_1.28
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 hwriter_1.3.2 ellipsis_0.3.0
## [4] fs_1.3.1 rstudioapi_0.11 farver_2.0.3
## [7] bit64_0.9-7 fansi_0.4.1 lubridate_1.7.4
## [10] xml2_1.2.2 splines_3.6.2 jsonlite_1.6.1
## [13] broom_0.5.5 dbplyr_1.4.2 png_0.1-7
## [16] compiler_3.6.2 httr_1.4.1 backports_1.1.5
## [19] assertthat_0.2.1 Matrix_1.2-18 lazyeval_0.2.2
## [22] cli_2.0.2 htmltools_0.4.0 prettyunits_1.1.1
## [25] tools_3.6.2 gtable_0.3.0 GenomeInfoDbData_1.2.2
## [28] reshape2_1.4.3 rappdirs_0.3.1 Rcpp_1.0.3
## [31] cellranger_1.1.0 vctrs_0.2.3 nlme_3.1-144
## [34] rtracklayer_1.46.0 xfun_0.12 rvest_0.3.5
## [37] lifecycle_0.1.0 zlibbioc_1.32.0 scales_1.1.0
## [40] hms_0.5.3 RColorBrewer_1.1-2 curl_4.3
## [43] yaml_2.2.1 memoise_1.1.0 biomaRt_2.42.0
## [46] latticeExtra_0.6-29 stringi_1.4.6 RSQLite_2.2.0
## [49] GenomicFeatures_1.38.2 rlang_0.4.5 pkgconfig_2.0.3
## [52] bitops_1.0-6 evaluate_0.14 lattice_0.20-40
## [55] labeling_0.3 bit_1.1-15.2 tidyselect_1.0.0
## [58] plyr_1.8.5 magrittr_1.5 R6_2.4.1
## [61] generics_0.0.2 DBI_1.1.0 mgcv_1.8-31
## [64] pillar_1.4.3 haven_2.2.0 withr_2.1.2
## [67] RCurl_1.98-1.1 modelr_0.1.6 crayon_1.3.4
## [70] BiocFileCache_1.10.2 rmarkdown_2.1 jpeg_0.1-8.1
## [73] progress_1.2.2 locfit_1.5-9.1 grid_3.6.2
## [76] blob_1.2.1 reprex_0.3.0 digest_0.6.25
## [79] xtable_1.8-4 openssl_1.4.1 munsell_0.5.0
## [82] askpass_1.1